diff --git a/.Doxyfile b/.Doxyfile index c3386af2..9dbfe4ba 100644 --- a/.Doxyfile +++ b/.Doxyfile @@ -771,8 +771,11 @@ WARN_LOGFILE = # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING # Note: If this tag is empty the current directory is searched. -INPUT = ./inference/engine/api ./inference/flow/include/flow.h \ -./common/uni/include/task.h ./inference/flow/src/flow.proto +INPUT = ./inference/engine/api/c \ +./inference/engine/api/java \ +./inference/flow/include/flow.h ./common/uni/include/task.h ./inference/flow/src/flow.proto \ +./training/api/training/api/API.h \ +./training/demos/common/training.h # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses diff --git a/.gitignore b/.gitignore index 587e6768..016a22bc 100644 --- a/.gitignore +++ b/.gitignore @@ -53,6 +53,8 @@ kit/Android/SimpleImageClassification/app/src/main/java kit/iOS/SimpleImgClassfication/libbolt kit/Android/Semantics/app/src/main/java kit/Android/Semantics/app/src/main/assets/ +kit/Android +kit/iOS final_combinations.txt diff --git a/CMakeLists.txt b/CMakeLists.txt index 1d06678f..9ca700b2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,6 +25,9 @@ if (NOT "$ENV{JNI_ROOT}" STREQUAL "") set(USE_JNI ON) endif(JNI_FOUND) endif () +if (USE_SECURE_C) + find_package(SecureC) +endif () if (BUILD_TEST) find_package(jpeg) if (EXISTS ${OpenCV_CMAKE_PATH}) @@ -33,7 +36,7 @@ if (BUILD_TEST) endif (BUILD_TEST) add_subdirectory(common) -if (USE_CAFFE OR USE_ONNX OR USE_TFLITE OR USE_TENSORFLOW) +if (USE_CAFFE OR USE_ONNX OR USE_TFLITE OR USE_TENSORFLOW OR USE_MINDSPORE) add_subdirectory(model_tools) endif() add_subdirectory(compute) @@ -45,6 +48,13 @@ message(STATUS "CXXFLAGS: ${CMAKE_CXX_FLAGS}") add_custom_target(bolt_library ALL COMMAND bash ./scripts/build_light_bolt.sh ${CMAKE_SYSTEM_NAME} ${CMAKE_CXX_COMPILER} ${CMAKE_AR} ${CMAKE_STRIP} ${CMAKE_CXX_FLAGS} ${CMAKE_CXX_OUTPUT_EXTENSION} ${CMAKE_SHARED_LIBRARY_PREFIX} ${CMAKE_SHARED_LIBRARY_SUFFIX} ${CMAKE_STATIC_LIBRARY_PREFIX} ${CMAKE_STATIC_LIBRARY_SUFFIX} ${CMAKE_BINARY_DIR} WORKING_DIRECTORY ${BOLT_ROOT}) +if (USE_TRAINING) + set(TRAINING_BUILD_C_API ON) + set(TRAINING_BUILD_DEMO ON) + add_subdirectory(training) + add_dependencies(Raul blas_enhance uni) + add_dependencies(Raul blas_enhance_static uni_static) +endif (USE_TRAINING) add_dependencies(bolt_library engine model_spec tensor image blas_enhance uni) add_dependencies(bolt_library engine_static model_spec_static tensor_static image_static blas_enhance_static uni_static) @@ -70,11 +80,30 @@ endif () enable_testing() find_program (BASH_PROGRAM bash) if (BASH_PROGRAM AND USE_GENERAL) + file(GLOB CPUINFO_CMAKE_FILE $ENV{BOLT_ROOT}/common/cmakes/cpuinfo.cmake ${BOLT_ROOT}/common/cmakes/cpuinfo.cmake) + include(${CPUINFO_CMAKE_FILE}) set(parameters --host_dir=${CMAKE_INSTALL_PREFIX}) if (ANDROID) set(parameters ${parameters} -d android --device_dir=/data/local/tmp/uldra) elseif("${CMAKE_HOST_SYSTEM_PROCESSOR}" STREQUAL "${CMAKE_SYSTEM_PROCESSOR}" AND "${CMAKE_HOST_SYSTEM}" MATCHES "${CMAKE_SYSTEM_NAME}*") - set(parameters ${parameters} -d host) + if ("${CMAKE_HOST_SYSTEM_PROCESSOR}" STREQUAL "aarch64" OR "${CMAKE_HOST_SYSTEM_PROCESSOR}" STREQUAL "armv7") + set(parameters ${parameters} -d host) + elseif (USE_X86) + set(x86_test ${cpuinfo_avx2}) + if (USE_INT8) + set(x86_test ${cpuinfo_avx512}) + endif () + if (USE_AVX512_VNNI) + set(x86_test ${cpuinfo_avx512_vnni}) + endif() + if (x86_test) + set(parameters 
${parameters} -d host) + else () + set(parameters ${parameters} -d unknown) + endif () + else () + set(parameters ${parameters} -d unknown) + endif() else() set(parameters ${parameters} -d unknown) endif() diff --git a/README.md b/README.md index 9b37e130..995733cd 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,8 @@ --- [![License: MIT](docs/images/license-mit.png)](https://opensource.org/licenses/MIT) -[Bolt](https://huawei-noah.github.io/bolt/) is a light-weight library for deep learning. Bolt, as a universal deployment tool for all kinds of neural networks, aims to minimize the inference runtime as much as possible. +[Bolt](https://huawei-noah.github.io/bolt/) is a light-weight library for deep learning. +Bolt, as a universal deployment tool for all kinds of neural networks, aims to automate the deployment pipeline and achieve extreme acceleration. Bolt has been widely deployed and used in many departments of HUAWEI company, such as 2012 Laboratory, CBG and HUAWEI Product Lines. If you have questions or suggestions, you can submit issue. **QQ群: 833345709** @@ -11,7 +12,7 @@ If you have questions or suggestions, you can submit issue. **QQ群: 833345709** - **High Performance:** **15%+** faster than existing open source acceleration libraries. - **Rich Model Conversion:** support Caffe, ONNX, TFLite, Tensorflow. - **Various Inference Precision:** support FP32, FP16, INT8, 1-BIT. -- **Multiple platforms:** ARM CPU(v7, v8, v8.2), Mali GPU, Qualcomm GPU, X86 CPU(AVX2, AVX512) +- **Multiple platforms:** ARM CPU(v7, v8, v8.2+), Mali GPU, Qualcomm GPU, X86 CPU(AVX2, AVX512) - **Bolt is the first to support NLP and also supports common CV applications.** - **Minimize ROM/RAM** - Rich Graph Optimization @@ -23,30 +24,42 @@ If you have questions or suggestions, you can submit issue. **QQ群: 833345709** # Building Status --- -Kinds of choices are provided for the compilation of bolt. Please make a suitable choice depending on your environment. 
- -| target platform | build command | Linux | Windows | MacOS | -| -------------------- | -------------------------------------------- | ----- | ------- | ----- | -| Android(armv7) | ./install.sh --target=android-armv7 | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/linux-android-armv7)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Alinux-android-armv7) | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/windows-android-armv7)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Awindows-android-armv7) | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/macos-android-armv7)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Amacos-android-armv7) | -| Android(armv8+gpu) | ./install.sh --target=android-aarch64 --gpu | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/linux-android-armv8)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Alinux-android-armv8) | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/windows-android-armv7)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Awindows-android-armv8) | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/macos-android-armv8)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Amacos-android-armv8) | -| Android(x86_64) | ./install.sh --target=android-x86_64 | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/linux-android-x86_64)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Alinux-android-x86_64) | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/windows-android-x86_64)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Awindows-android-x86_64) | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/macos-android-x86_64)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Amacos-android-x86_64) | -| iOS(armv7) | ./install.sh --target=ios-armv7 | / | / | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/macos-ios-armv7)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Amacos-ios-armv7) | -| iOS(armv8) | ./install.sh --target=ios-aarch64 | / | / | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/macos-ios-armv8)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Amacos-ios-armv8) | -| Linux(X86_64) | ./install.sh --target=linux-x86_64 | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/linux-x86)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Alinux-x86) | / | / | -| Linux(x86_64_avx2) | ./install.sh --target=linux-x86_64_avx2 | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/linux-x86-avx2)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Alinux-x86-avx2) | / | / | -| Windows(X86_64) | ./install.sh --target=windows-x86_64 | / | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/windows-x86)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Awindows-x86) | / | -| Windows(x86_64_avx2) | ./install.sh --target=windows-x86_64_avx2 | / | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/windows-x86-avx2)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Awindows-x86-avx2) | / | -| MacOS(X86_64) | ./install.sh --target=macos-x86_64 | / | 
/ | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/macos-x86)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Amacos-x86) | -| MacOS(x86_64_avx2) | ./install.sh --target=macos-x86_64_avx2 | / | / | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/macos-x86-avx2)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Amacos-x86-avx2) | - -*NOTE: Bolt defaultly link static library, This may cause some problem on some platforms. You can use --shared option to link shared library.* +The most commonly used inference targets are listed below. More targets can be found in [scripts/target.sh](scripts/target.sh). Please make a suitable choice depending on your environment. +If you want to build the on-device training module, add the **--train** option. +If you want to use multi-thread parallelism, add the **--openmp** option. + +*Bolt links the static library by default, which may cause problems on some platforms. You can use the --shared option to link the shared library instead.* + +| target platform | precision | build command | Linux | Windows | MacOS | +| ---------------------- | ------------------ | ------------------------------------------------------ | ----- | ------- | ----- | +| Android(armv7) | fp32,int8 | ./install.sh --target=android-armv7 | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/linux-android-armv7)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Alinux-android-armv7) | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/windows-android-armv7)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Awindows-android-armv7) | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/macos-android-armv7)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Amacos-android-armv7) | +| Android(armv8) | fp32,int8 | ./install.sh --target=android-aarch64 --fp16=off | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/linux-android-armv8)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Alinux-android-armv8) | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/windows-android-armv7)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Awindows-android-armv8) | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/macos-android-armv8)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Amacos-android-armv8) | +| Android(armv8.2+) | fp32,fp16,int8,bnn | ./install.sh --target=android-aarch64 | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/linux-android-armv8)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Alinux-android-armv8) | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/windows-android-armv7)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Awindows-android-armv8) | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/macos-android-armv8)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Amacos-android-armv8) | +| Android(gpu) | fp16 | ./install.sh --target=android-aarch64 --gpu | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/linux-android-armv8)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Alinux-android-armv8) | [![Build 
Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/windows-android-armv7)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Awindows-android-armv8) | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/macos-android-armv8)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Amacos-android-armv8) | +| Android(x86_64) | fp32,int8 | ./install.sh --target=android-x86_64 | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/linux-android-x86_64)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Alinux-android-x86_64) | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/windows-android-x86_64)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Awindows-android-x86_64) | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/macos-android-x86_64)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Amacos-android-x86_64) | +| iOS(armv7) | fp32,int8 | ./install.sh --target=ios-armv7 | / | / | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/macos-ios-armv7)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Amacos-ios-armv7) | +| iOS(armv8) | fp32,int8 | ./install.sh --target=ios-aarch64 --fp16=off | / | / | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/macos-ios-armv8)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Amacos-ios-armv8) | +| iOS(armv8.2+) | fp32,fp16,int8,bnn | ./install.sh --target=ios-aarch64 | / | / | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/macos-ios-armv8)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Amacos-ios-armv8) | +| Linux(armv7) | fp32,int8 | ./install.sh --target=linux-armv7_blank | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/linux-x86)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Alinux-x86) | / | / | +| Linux(armv8) | fp32,int8 | ./install.sh --target=linux-aarch64_blank --fp16=off | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/linux-x86)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Alinux-x86) | / | / | +| Linux(armv8.2+) | fp32,fp16,int8,bnn | ./install.sh --target=linux-aarch64_blank | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/linux-x86)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Alinux-x86) | / | / | +| Linux(x86_64) | fp32,int8 | ./install.sh --target=linux-x86_64 | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/linux-x86)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Alinux-x86) | / | / | +| Linux(x86_64_avx2) | fp32 | ./install.sh --target=linux-x86_64_avx2 | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/linux-x86-avx2)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Alinux-x86-avx2) | / | / | +| Linux(x86_64_avx512) | fp32,int8 | ./install.sh --target=linux-x86_64_avx512 | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/linux-x86-avx2)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Alinux-x86-avx2) | / | / | +| Windows(x86_64) | fp32,int8 | ./install.sh --target=windows-x86_64 | / | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/windows-x86)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Awindows-x86) | 
/ | +| Windows(x86_64_avx2) | fp32 | ./install.sh --target=windows-x86_64_avx2 | / | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/windows-x86-avx2)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Awindows-x86-avx2) | / | +| Windows(x86_64_avx512) | fp32,int8 | ./install.sh --target=windows-x86_64_avx512 | / | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/windows-x86-avx2)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Awindows-x86-avx2) | / | +| MacOS(armv8.2+) | fp32,fp16,int8,bnn | ./install.sh --target=macos-aarch64 | / | / | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/macos-x86)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Amacos-x86) | +| MacOS(x86_64) | fp32,int8 | ./install.sh --target=macos-x86_64 | / | / | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/macos-x86)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Amacos-x86) | +| MacOS(x86_64_avx2) | fp32 | ./install.sh --target=macos-x86_64_avx2 | / | / | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/macos-x86-avx2)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Amacos-x86-avx2) | +| MacOS(x86_64_avx512) | fp32,int8 | ./install.sh --target=macos-x86_64_avx512 | / | / | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/macos-x86-avx2)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Amacos-x86-avx2) | # Quick Start ---
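Before walking through the two steps below, build and install bolt for your platform using one of the commands from the table above. The sketch below is illustrative only; it simply combines targets and options (--gpu, --train, --openmp, --shared) that are described in this README, and the exact output layout depends on your toolchain setup.

```bash
# Hypothetical build invocations assembled from the option table above.

# Android armv8.2+ with GPU (Mali/Qualcomm) kernels:
./install.sh --target=android-aarch64 --gpu

# Same target with the on-device training module, OpenMP, and shared libraries:
./install.sh --target=android-aarch64 --train --openmp --shared

# Plain Linux x86_64 host build:
./install.sh --target=linux-x86_64
```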
Two steps to get started with bolt. -1. Conversion: use **[X2bolt](model_tools/tools/X2bolt/X2bolt.cpp)** to convert your model from caffe,onnx,tflite or tensorflow to .bolt; +1. Conversion: use **[X2bolt](model_tools/tools/X2bolt/X2bolt.cpp)** to convert your model from caffe, onnx, tflite or tensorflow to .bolt file; 2. Inference: run **[benchmark](inference/examples/benchmark/benchmark.cpp)** with .bolt and data to get the inference result. @@ -56,9 +69,10 @@ Two steps to get started with bolt. Here we show some interesting and useful applications in bolt. -| Face Detection | ASR | Semantics Analysis | Image Classification -| :------: | :------: | :------: |:------: -| [![face_detection](docs/images/20_bolt_face_detection.gif)](inference/examples/ultra_face) demo_link: [face detection](inference/examples/ultra_face) | [![asr](docs/images/ChineseSpeechRecognition.gif)]() demo_link: [asr](inference/examples/automatic_speech_recognition) | [![semantics analysis](docs/images/SemanticsAnalysis.gif)]() demo_link: [semantics analysis](kit/Android/Semantics) | [![image_classification](docs/images/ImageClassification.gif)]() demo_link: [image classification](inference/examples/image_classification) +| Face Detection | ASR | Semantics Analysis | Image Classification | Reading Comprehension | +| :------: | :------: | :------: | :------: | :------: | +| ![face_detection](docs/images/20_bolt_face_detection.gif) [android](kit/Android/FaceDetection) [ios](kit/iOS/FaceDetection) [exe](inference/examples/ultra_face) | ![asr](docs/images/ChineseSpeechRecognition.gif) [android](kit/Android/ChineseSpeechRecognition) [ios](kit/iOS/ChineseSpeechRecognition) | ![semantics analysis](docs/images/SemanticsAnalysis.gif) [android](kit/Android/Semantics) | ![image_classification](docs/images/ImageClassification.gif) [android](kit/Android/SimpleImageClassification) [ios](kit/iOS/SimpleImageClassification) | ![reading_comprehension](docs/images/ReadingComprehension.gif) [android](kit/Android/ReadingComprehension) | + # Verified Networks --- Bolt has shown its high performance in the inference of common CV and NLP neural networks. Some of the representative networks that we have verified are listed below. You can find detailed benchmark information in [docs/BENCHMARK.md](docs/BENCHMARK.md). @@ -81,16 +95,32 @@ Two steps to get started with bolt. BiRealNet, ReActNet, Ghostnet, - SSD, Yolov3, Pointnet, ViT, TNT ... + unet, + LCNet, Pointnet, + hair-segmentation, + duc, + fcn, + retinanet, + SSD, + Faster-RCNN, + Mask-RCNN, + Yolov2, + Yolov3, + Yolov4, + Yolov5, + ViT, TNT ... NLP - Bert, - Albert, Neural Machine Translation, Text To Speech, Automatic Speech Recognition, - Tdnn ... + Bert, Albert, Tinybert, Neural Machine Translation, Text To Speech(Tactron,Tactron2,FastSpeech+hifigan,melgan), Automatic Speech Recognition, DFSMN, Conformer, + Tdnn, + FRILL, + T5, + GPT-2, + Roberta ... @@ -111,18 +141,23 @@ Two steps to get started with bolt. - More models than these mentioned above are supported, users are encouraged to further explore. + More models than these mentioned above are supported, users are encouraged to further explore. + +# On-Device Training +--- +On-Device Training has come, it's a beta vesion which supports [Lenet](./training/demos/lenet_demo/), [Mobilenet_v1](./training/demos/mobilenet_v1_demo) and [Resnet18](./training/demos/resnet18_demo) for training on the embedded devices and servers. Want more details of on-device training in bolt? 
Get started with the official training [tutorial](./training/TUTORIAL.md). # Documentations --- Everything you want to know about bolt is recorded in the detailed documentations stored in [docs](docs). -- [How to install bolt with different compilers](docs/INSTALL.md). -- [How to use bolt to inference your ML models.](docs/USER_HANDBOOK.md) -- [How to develop bolt to customize more models.](docs/DEVELOPER.md) +- [How to install bolt with different compilers?](docs/INSTALL.md) +- [How to use bolt to inference your ML models?](docs/USER_HANDBOOK.md) +- [How to develop bolt to customize more models?](docs/DEVELOPER.md) - [Operators documentation](docs/OPERATORS.md) - [Benchmark results on some universal models.](docs/BENCHMARK.md) -- [How to build demo/example with kit.](docs/KIT.md) +- [How to visualise/protect bolt model?](docs/USER_HANDBOOK.md#model-visualization) +- [How to build demo/example with kit?](docs/KIT.md) - [Frequently Asked Questions(FAQ)](docs/FAQ.md) # Articles --- @@ -133,6 +168,7 @@ Everything you want to know about bolt is recorded in the detailed documentation - [Bolt GPU性能优化,让上帝帮忙掷骰子](https://zhuanlan.zhihu.com/p/336218879) - [Bolt助力HMS机器翻译,自然语言处理又下一城](https://zhuanlan.zhihu.com/p/337887620) - [ARM CPU 1-bit推理,走向极致的道路](https://zhuanlan.zhihu.com/p/158161592) +- [基于深度学习加速库Bolt的声音克隆技术(Voice Cloning)](https://zhuanlan.zhihu.com/p/498919929) # 教程 --- @@ -141,7 +177,8 @@ Everything you want to know about bolt is recorded in the detailed documentation - 情感分类: [Android Demo](https://zhuanlan.zhihu.com/p/414971037) - 中文语音识别: [Android Demo](https://zhuanlan.zhihu.com/p/414978782), [iOS Demo](https://zhuanlan.zhihu.com/p/414981121) - 人脸检测: [Android Demo](https://zhuanlan.zhihu.com/p/414975102), [iOS Demo](https://zhuanlan.zhihu.com/p/414971375) - +- 阅读理解: [Android Demo](https://zhuanlan.zhihu.com/p/498906834) + # Acknowledgement --- Bolt refers to the following projects: [caffe](https://github.com/BVLC/caffe), [onnx](https://github.com/onnx/onnx), [tensorflow](https://github.com/tensorflow/tensorflow), [ncnn](https://github.com/Tencent/ncnn), [mnn](https://github.com/alibaba/MNN), [dabnn](https://github.com/JDAI-CV/dabnn). 
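The USE_SECURE_C option wired into CMakeLists.txt and common/cmakes/bolt.cmake above is resolved by the FindSecureC.cmake module added later in this patch. A minimal sketch of the environment that module expects is shown below; the install prefix /opt/securec and the direct cmake invocation are illustrative assumptions, only the SecureC_ROOT variable and the include/lib layout come from the module itself.

```bash
# Hypothetical SecureC setup for USE_SECURE_C=ON builds (paths are examples).
export SecureC_ROOT=/opt/securec          # prefix searched via $ENV{SecureC_ROOT}
ls "$SecureC_ROOT/include/securec.h"      # header located by find_path
ls "$SecureC_ROOT"/lib/libsecurec.*       # library located by find_library

# Generic CMake example enabling the option; bolt builds normally go through install.sh.
cmake -DUSE_SECURE_C=ON <path-to-bolt-source>
```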
diff --git a/SUMMARY.md b/SUMMARY.md new file mode 100644 index 00000000..7a4216cd --- /dev/null +++ b/SUMMARY.md @@ -0,0 +1,42 @@ +# Summary + +* [Introduction](README.md) + + +* [Architecture](docs/ARCHITECTURE.md) + + +* [Operators](docs/OPERATORS.md) + + +* [Install](docs/INSTALL.md) + + +* [Basic Inference Usage](docs/USER_HANDBOOK.md#basic-usage) + + +* [Basic On-device Training Usage](training/TUTORIAL.md) + + +* [Advanced Features](docs/USER_HANDBOOK.md#advanced-features) + + +* [Developer Customization](docs/DEVELOPER.md) + + +* [How to Reduce GPU Initial Time](docs/REDUCE_GPU_PREPARE_TIME.md) + + +* [Kit Example](docs/KIT.md) + + +* [Changelog](docs/CHANGELOG.md) + + +* [FAQ](docs/FAQ.md) + + +* [Feedback](docs/FEEDBACK.md) + + +* [Appendix](docs/IOS_USAGE.md) diff --git a/book.json b/book.json new file mode 100644 index 00000000..baf158d1 --- /dev/null +++ b/book.json @@ -0,0 +1,22 @@ +{ + "plugins": [ + "github", + "back-to-top-button", + "page-toc-button", + "insert-logo" + ], + + "pluginsConfig": { + "github": { + "url": "https://github.com/huawei-noah/bolt" + }, + "page-toc-button": { + "maxTocDepth": 1, + "minTocSize": 2 + }, + "insert-logo":{ + "url":"../docs/images/LOGO.PNG", + "style":"background:none;max-height:100px" + } + } +} diff --git a/common/cmakes/FindSecureC.cmake b/common/cmakes/FindSecureC.cmake new file mode 100644 index 00000000..72a8ed82 --- /dev/null +++ b/common/cmakes/FindSecureC.cmake @@ -0,0 +1,24 @@ +find_path(SecureC_INCLUDE_DIR NAMES securec.h HINTS $ENV{SecureC_ROOT}/include ${SecureC_ROOT}/include) + +if (USE_DYNAMIC_LIBRARY) + find_library(SecureC_LIBRARY NAMES securec HINTS $ENV{SecureC_ROOT}/lib ${SecureC_ROOT}/lib) + set(SecureC_SHARED_LIBRARY ${SecureC_LIBRARY}) +else (USE_DYNAMIC_LIBRARY) + find_library(SecureC_LIBRARY NAMES ${CMAKE_STATIC_LIBRARY_PREFIX}securec${CMAKE_STATIC_LIBRARY_SUFFIX} HINTS $ENV{SecureC_ROOT}/lib ${SecureC_ROOT}/lib) + find_library(SecureC_SHARED_LIBRARY NAMES securec HINTS $ENV{SecureC_ROOT}/lib ${SecureC_ROOT}/lib) +endif (USE_DYNAMIC_LIBRARY) + +if (SecureC_INCLUDE_DIR AND SecureC_LIBRARY) + set(SecureC_FOUND true) +endif (SecureC_INCLUDE_DIR AND SecureC_LIBRARY) + +if (SecureC_FOUND) + include_directories(${SecureC_INCLUDE_DIR}) + message(STATUS "Found securec.h: ${SecureC_INCLUDE_DIR}") + message(STATUS "Found securec: ${SecureC_LIBRARY}") +else (SecureC_FOUND) + message(FATAL_ERROR " +FATAL: cannot find securec library in /[include|lib] directory, + please set the shell environment variable SecureC_ROOT. 
+ ") +endif (SecureC_FOUND) diff --git a/common/cmakes/bolt.cmake b/common/cmakes/bolt.cmake index 04308d5b..3f0378d1 100644 --- a/common/cmakes/bolt.cmake +++ b/common/cmakes/bolt.cmake @@ -12,6 +12,7 @@ option(USE_CAFFE "set use caffe model as input or not" OFF) option(USE_ONNX "set use onnx model as input or not" OFF) option(USE_TFLITE "set use tflite model as input or not" OFF) option(USE_TENSORFLOW "set use tensorflow model as input or not" OFF) +option(USE_MINDSPORE "set use mindspore model as input or not" OFF) # blas_enhance tensor option(USE_GENERAL "set use CPU serial code or not" OFF) @@ -26,12 +27,23 @@ option(USE_INT8_WINOGRAD "set use ARM NEON INT8 winograd" ON) option(USE_OPENMP "set use openmp to run test(tinybert) or not" OFF) option(USE_LIBRARY_TUNING "set use algorithm tuning or not" OFF) +option(USE_MEM_CHECK "set to use memory check or not" OFF) +option(USE_MODEL_PRINT "set to use model print or not" ON) +option(USE_SECURE_C "set to use Huawei Secure C or not" OFF) + +option(USE_TRAINING "set whether to use training or not" OFF) option(USE_FLOW "set whether to use flow or not" OFF) option(USE_JNI "set whether to use Java API or not" OFF) option(BUILD_TEST "set to build unit test or not" OFF) +include(CheckCXXCompilerFlag) + +if (USE_TRAINING) + set(ANDROID_TOOLCHAIN_PREFIX "aarch64-linux-android-") +endif(USE_TRAINING) + function (set_policy) if (POLICY CMP0074) cmake_policy(SET CMP0074 NEW) @@ -39,15 +51,19 @@ function (set_policy) endfunction(set_policy) macro (set_c_cxx_flags) - set(COMMON_FLAGS "-W -Wextra -O3 -fPIC") - if (NOT WIN32) - set(COMMON_FLAGS "${COMMON_FLAGS} -fstack-protector-all") - endif() + set(COMMON_FLAGS "-O3 -fPIC -fPIE") + # warning flag can be remove in release version + set(COMMON_FLAGS "${COMMON_FLAGS} -W -Wextra") + set(COMMON_FLAGS "${COMMON_FLAGS} -fstack-protector-all -fstack-protector-strong") set(COMMON_FLAGS "${COMMON_FLAGS} -Wno-unused-command-line-argument -Wno-unused-parameter") set(COMMON_FLAGS "${COMMON_FLAGS} -Wno-unused-result -Wno-deprecated-declarations -Wno-unused-variable") if (USE_OPENMP) set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_OPENMP -fopenmp") + CHECK_CXX_COMPILER_FLAG("-static-openmp" COMPILER_SUPPORTS_STATIC_OPENMP) + if (COMPILER_SUPPORTS_STATIC_OPENMP) + set(COMMON_FLAGS "${COMMON_FLAGS} -static-openmp") + endif () endif(USE_OPENMP) if (USE_THREAD_SAFE OR USE_CAFFE OR USE_ONNX OR USE_FLOW) @@ -99,27 +115,29 @@ macro (set_c_cxx_flags) if (USE_INT8) set(COMMON_FLAGS "${COMMON_FLAGS} -mavx512f") endif (USE_INT8) - if (USE_AVX512_VNNI) + if (USE_AVX512_VNNI) set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_AVX512_VNNI") - endif(USE_AVX512_VNNI) + endif(USE_AVX512_VNNI) endif(USE_X86) if (USE_FP32) set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_FP32") endif (USE_FP32) + if (USE_FP16) + set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_FP16") + if (USE_F16_MIX_PRECISION) + set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_F16_MIX_PRECISION") + endif (USE_F16_MIX_PRECISION) + endif () + if (USE_INT8) set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_INT8") endif (USE_INT8) if (USE_NEON) set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_NEON") - if (USE_FP16) - set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_FP16") - if (USE_F16_MIX_PRECISION) - set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_F16_MIX_PRECISION") - endif (USE_F16_MIX_PRECISION) if (USE_INT8) if (USE_INT8_WINOGRAD) set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_INT8_WINOGRAD") @@ -137,9 +155,6 @@ macro (set_c_cxx_flags) endif () endif (USE_INT8) endif (USE_FP16) - if (USE_INT8) - set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_INT8") - 
endif () endif(USE_NEON) if (USE_CAFFE) @@ -154,6 +169,21 @@ macro (set_c_cxx_flags) if (USE_TENSORFLOW) set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_TENSORFLOW") endif() + if (USE_MINDSPORE) + set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_MINDSPORE") + endif() + + if (USE_MEM_CHECK) + set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_MEM_CHECK") + endif() + + if (USE_MODEL_PRINT) + set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_MODEL_PRINT") + endif() + + if (USE_SECURE_C) + set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_SECURE_C") + endif() set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${COMMON_FLAGS} -std=gnu99") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COMMON_FLAGS} -std=c++11") @@ -168,8 +198,14 @@ endmacro(set_c_cxx_flags) macro (set_test_c_cxx_flags) if (NOT USE_DYNAMIC_LIBRARY) set(COMMON_FLAGS "${COMMON_FLAGS} -static-libstdc++") - if (NOT "${CMAKE_HOST_SYSTEM_PROCESSOR}" STREQUAL "${CMAKE_SYSTEM_PROCESSOR}" AND "${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") - set(COMMON_FLAGS "${COMMON_FLAGS} -static") + if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") + set(COMMON_FLAGS "${COMMON_FLAGS} -static-libgcc") + if (NOT "${CMAKE_HOST_SYSTEM_PROCESSOR}" STREQUAL "${CMAKE_SYSTEM_PROCESSOR}") + set(COMMON_FLAGS "${COMMON_FLAGS} -static") + endif() + if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Windows") + set(COMMON_FLAGS "${COMMON_FLAGS} -static") + endif() endif() endif() @@ -198,6 +234,7 @@ if(USE_DYNAMIC_LIBRARY) set(model_tools_onnx_library model_tools_onnx) set(model_tools_tflite_library model_tools_tflite) set(model_tools_tensorflow_library model_tools_tensorflow) + set(model_tools_mindspore_library model_tools_mindspore) set(model_tools_library model_tools) set(engine_library engine) set(flow_library flow) @@ -213,6 +250,7 @@ else() set(model_tools_onnx_library model_tools_onnx_static) set(model_tools_tflite_library model_tools_tflite_static) set(model_tools_tensorflow_library model_tools_tensorflow_static) + set(model_tools_mindspore_library model_tools_mindspore_static) set(model_tools_library model_tools_static) set(engine_library engine_static) set(flow_library flow_static) @@ -220,10 +258,16 @@ endif() macro(include_uni) include_directories(${BOLT_ROOT}/common/uni/include) + if (USE_SECURE_C) + include_directories(${SecureC_ROOT}/include) + endif () endmacro() macro(link_uni name) target_link_libraries(${name} ${uni_library}) + if (USE_SECURE_C) + target_link_libraries(${name} ${SecureC_LIBRARY}) + endif () endmacro() macro(include_model_spec) @@ -330,7 +374,10 @@ macro(link_model_tools name) target_link_libraries(${name} ${model_tools_tensorflow_library}) target_link_libraries(${name} ${JSONCPP_LIBRARY}) endif() - if(USE_CAFFE OR USE_ONNX) + if(USE_MINDSPORE) + target_link_libraries(${name} ${model_tools_mindspore_library}) + endif() + if(USE_CAFFE OR USE_ONNX OR USE_MINDSPORE) link_protobuf(${name}) endif() link_model_spec(${name}) diff --git a/common/cmakes/cpuinfo.cmake b/common/cmakes/cpuinfo.cmake new file mode 100644 index 00000000..87c94f77 --- /dev/null +++ b/common/cmakes/cpuinfo.cmake @@ -0,0 +1,16 @@ +set(CPUINFO "null") +file(GLOB CPUINFO_FILE /proc/cpuinfo) +if (CPUINFO_FILE) + exec_program(cat ARGS ${CPUINFO_FILE} OUTPUT_VARIABLE CPUINFO) +else () + message(STATUS "can not find /proc/cpuinfo") +endif () + +macro(check_cpuinfo feature) + string(REGEX REPLACE "^.*(${feature}).*$" "\\1" _FEATURE_THERE ${CPUINFO}) + string(COMPARE EQUAL "${feature}" "${_FEATURE_THERE}" cpuinfo_${feature}) +endmacro() + +check_cpuinfo(avx2) +check_cpuinfo(avx512) +check_cpuinfo(avx512_vnni) diff --git a/common/gcl/include/gcl_common.h 
b/common/gcl/include/gcl_common.h index 0e0e16c2..2836d69c 100644 --- a/common/gcl/include/gcl_common.h +++ b/common/gcl/include/gcl_common.h @@ -130,13 +130,14 @@ inline CI8 *map_cl_error_2_string(cl_int err) } } -#define map_cl_error_2_ee(err) \ - { \ - if (err == 0) \ - return SUCCESS; \ - UNI_ERROR_LOG("GCLAPI error in: File: %s Line: %d Func name is: %s GCLERROR = %s\n", \ - __FILE__, __LINE__, __FUNCTION__, map_cl_error_2_string(err)); \ - return GCL_ERROR; \ +#define map_cl_error_2_ee(err) \ + { \ + if (err == 0) { \ + return SUCCESS; \ + } else { \ + UNI_ERROR_LOG("GCLAPI error: %s.\n", map_cl_error_2_string(err)); \ + return GCL_ERROR; \ + } \ } inline EE has_dedicated_local(Device device, I32 *b) @@ -171,6 +172,14 @@ struct GCLKernelInfo { std::string name; }; +typedef struct { + I32 algorithm; + U32 best_h[6]; + U32 best_c[6]; + U32 best_k[6]; +} ForwardRunInfoMali; +typedef ForwardRunInfoMali *ForwardRunInfoMali_t; + struct GCLHandle { Platform *platforms; U32 numPlatform; @@ -201,6 +210,8 @@ struct GCLHandle { std::string deviceName; std::map kernelMap; std::map programMap; + std::map, ForwardRunInfoMali> runInfoCache; + std::map> kernelLSCache; std::vector *kernelVec; std::string curOpName; void *kernel_source; @@ -221,14 +232,6 @@ struct GCLHandleConfig { typedef GCLHandleConfig *GCLHandleConfig_t; -typedef struct { - I32 algorithm; - U32 best_h[6]; - U32 best_c[6]; - U32 best_k[6]; -} ForwardRunInfoMali; -typedef ForwardRunInfoMali *ForwardRunInfoMali_t; - typedef struct { GCLHandle_t handle; GCLMemDesc_t gclmemFilterDesc; diff --git a/common/gcl/include/gcl_func.h b/common/gcl/include/gcl_func.h index 515c1486..93cb7130 100644 --- a/common/gcl/include/gcl_func.h +++ b/common/gcl/include/gcl_func.h @@ -559,7 +559,8 @@ inline EE gcl_create_kernel_with_source_map( option = handle->common_source_opt + " " + option; } if (!kernel_source->get_source(sourceName, &source_ptr)) { - UNI_ERROR_LOG("the %s doesn't exist in sourceMap\n", sourceName); + UNI_ERROR_LOG( + "the %s doesn't exist in sourceMap to find kernel %s.\n", sourceName, kernelName); CHECK_STATUS(NULL_POINTER); } @@ -878,6 +879,53 @@ inline EE gcl_run_kernel( return SUCCESS; } +inline EE gcl_get_kernel_name(Kernel kernel, I8 *kernelName) +{ + char name[256]; + U32 len; + CHECK_STATUS(get_kernel_name(kernel, name, &len)); + if (len > 256) { + UNI_ERROR_LOG("KernelName length %d > 256, please reset name array length\n", len); + CHECK_STATUS(NOT_MATCH); + } else { + UNI_STRCPY(kernelName, name); + } + return SUCCESS; +} + +inline void gcl_set_kernel_ls_to_cache(GCLHandle_t handle, CI8 *kernelName, U32 gs[3], U32 ls[3]) +{ + std::string name = kernelName; + name += "_" + std::to_string(gs[0]); + name += "_" + std::to_string(gs[1]); + name += "_" + std::to_string(gs[2]); + std::vector lsVec = {ls[0], ls[1], ls[2]}; + if (handle->kernelLSCache.find(name) == handle->kernelLSCache.end()) { + handle->kernelLSCache[name] = lsVec; + } +} + +inline bool gcl_get_kernel_ls_from_cache(GCLHandle_t handle, CI8 *kernelName, U32 gs[3], U32 ls[3]) +{ + std::string name = kernelName; + name += "_" + std::to_string(gs[0]); + name += "_" + std::to_string(gs[1]); + name += "_" + std::to_string(gs[2]); + if (handle->kernelLSCache.find(name) != handle->kernelLSCache.end()) { + for (U32 i = 0; i < 3; i++) { + ls[i] = handle->kernelLSCache[name][i]; + } + UNI_DEBUG_LOG("get kernel %s ls from cache success, gs is {%d %d %d}, ls is {%d %d %d}\n", + kernelName, gs[0], gs[1], gs[2], ls[0], ls[1], ls[2]); + return true; + } else { + UNI_DEBUG_LOG("get 
kernel %s ls from cache fail, try to find best ls for kernel, gs is {%d " + "%d %d}\n", + kernelName, gs[0], gs[1], gs[2]); + return false; + } +} + inline U32 get_next_ls_size(U32 ls_size) { return (ls_size << 1); @@ -969,16 +1017,20 @@ inline EE gcl_run_kernelVec_select_ls(GCLHandle_t handle, std::vector kerne for (auto index : kernelIndex) { auto kernelInfo = (*handle->kernelVec)[index]; bool needSelectLs = false; + U32 gs[3] = {0, 0, 0}; for (U32 i = 0; i < kernelInfo.dim; i++) { if (kernelInfo.ls[i] == 0) { needSelectLs = true; - break; } + gs[i] = kernelInfo.gs[i]; } if (!needSelectLs) { continue; } CHECK_STATUS(gcl_run_kernel_select_ls(handle, &kernelInfo)); + char kernelName[256]; + gcl_get_kernel_name(kernelInfo.kernel, kernelName); + gcl_set_kernel_ls_to_cache(handle, kernelName, gs, kernelInfo.ls); (*handle->kernelVec)[index].gs[0] = kernelInfo.gs[0]; (*handle->kernelVec)[index].gs[1] = kernelInfo.gs[1]; (*handle->kernelVec)[index].gs[2] = kernelInfo.gs[2]; @@ -995,17 +1047,18 @@ inline EE gcl_infer_best_kernelVec_ls_with_map( { std::vector kernelIndex; U32 len = handle->kernelVec->size(); + bool needSaveKernelThreadInfoToMap = false; for (U32 i = 0; i < len; i++) { auto kernelInfo = (*handle->kernelVec)[i]; - U32 gs[3]; - U32 ls[3]; + U32 gs[3] = {0}; + U32 ls[3] = {0}; bool findKernelThreadInfo = false; findKernelThreadInfo = algoMap->getKernelThreadInfoFromMap(kernelInfo.name, gs, ls); U32 dim = (*handle->kernelVec)[i].dim; if (findKernelThreadInfo) { U32 cur_gs[3]; for (U32 j = 0; j < dim; j++) { - cur_gs[j] = (*handle->kernelVec)[i].gs[j]; + cur_gs[j] = kernelInfo.gs[j]; if (ls[j] != 0) { cur_gs[j] = (cur_gs[j] + ls[j] - 1) / ls[j] * ls[j]; } @@ -1014,16 +1067,29 @@ inline EE gcl_infer_best_kernelVec_ls_with_map( } } else { bool noNeedInferLS = true; + needSaveKernelThreadInfoToMap = true; for (U32 j = 0; j < dim; j++) { - gs[j] = (*handle->kernelVec)[i].gs[j]; - ls[j] = (*handle->kernelVec)[i].ls[j]; + gs[j] = kernelInfo.gs[j]; + ls[j] = kernelInfo.ls[j]; if (ls[j] == 0) { noNeedInferLS = false; } } + if (!noNeedInferLS) { + char kernelName[256]; + gcl_get_kernel_name(kernelInfo.kernel, kernelName); + if (gcl_get_kernel_ls_from_cache(handle, kernelName, gs, ls)) { + for (U32 j = 0; j < dim; j++) { + (*handle->kernelVec)[i].ls[j] = ls[j]; + } + noNeedInferLS = true; + } + } if (noNeedInferLS) { for (U32 j = 0; j < dim; j++) { - (*handle->kernelVec)[i].gs[j] = (gs[j] + ls[j] - 1) / ls[j] * ls[j]; + if (ls[j] > 0) { + (*handle->kernelVec)[i].gs[j] = (gs[j] + ls[j] - 1) / ls[j] * ls[j]; + } } } if (!noNeedInferLS) { @@ -1032,9 +1098,11 @@ inline EE gcl_infer_best_kernelVec_ls_with_map( } } CHECK_STATUS(gcl_run_kernelVec_select_ls(handle, kernelIndex)); - for (U32 i = 0; i < len; i++) { - auto kernelInfo = (*handle->kernelVec)[i]; - algoMap->setKernelThreadInfoToMap(kernelInfo.name, kernelInfo.gs, kernelInfo.ls); + if (needSaveKernelThreadInfoToMap) { + for (U32 i = 0; i < len; i++) { + auto kernelInfo = (*handle->kernelVec)[i]; + algoMap->setKernelThreadInfoToMap(kernelInfo.name, kernelInfo.gs, kernelInfo.ls); + } } return SUCCESS; } @@ -1387,7 +1455,7 @@ inline EE gcl_set_kernelArgs(Kernel kernel, Args... 
args) inline std::string gclMemDesc2Str(GCLMemDesc desc) { char buff[128]; - snprintf(buff, sizeof(buff), "dt:%s memFormat:%s ", DataTypeName()[desc.dt], + UNI_SNPRINTF(buff, sizeof(buff), "dt:%s memFormat:%s ", DataTypeName()[desc.dt], DataFormatName()[desc.memFormat]); std::string descStr = buff; descStr += "stride("; @@ -1414,6 +1482,28 @@ inline EE gcl_get_image_size(GCLMem_t gclMem, U32 *width, U32 *height, U32 *dept CHECK_STATUS(get_image_size(gclMem->mem, width, height, depth)); return SUCCESS; } + +inline void gcl_set_runInfo_to_cache( + GCLHandle_t handle, std::vector flag, ForwardRunInfoMali runInfo) +{ + if (handle->runInfoCache.find(flag) == handle->runInfoCache.end()) { + handle->runInfoCache[flag] = runInfo; + } +} + +inline bool gcl_get_runInfo_from_cache( + GCLHandle_t handle, std::vector flag, ForwardRunInfoMali_t runInfo) +{ + if (handle->runInfoCache.find(flag) != handle->runInfoCache.end()) { + *runInfo = handle->runInfoCache[flag]; + UNI_DEBUG_LOG("get forward run info from cache success\n"); + return true; + } else { + UNI_DEBUG_LOG("get forward run info from cache fail, try to find best forward run info\n"); + return false; + } +} + #ifdef _DEBUG template inline EE gcl_print_memory(GCLHandle_t handle, GCLMem_t gclMem, CI8 *gclMemName = NULL) diff --git a/common/gcl/include/kernel.h b/common/gcl/include/kernel.h index 5653e1b5..d2147edc 100644 --- a/common/gcl/include/kernel.h +++ b/common/gcl/include/kernel.h @@ -49,6 +49,22 @@ inline EE get_kernel_info(Kernel kernel, cl_kernel_info info, void **value, size map_cl_error_2_ee(ret); } +inline EE get_kernel_name(Kernel kernel, char* name, U32 *len) +{ + if (NULL == name || NULL == len) { + return NULL_POINTER; + } + + size_t lenVal; + cl_int ret = clGetKernelInfo(kernel, CL_KERNEL_FUNCTION_NAME, 0, NULL, &lenVal); + if (ret != CL_SUCCESS) { + map_cl_error_2_ee(ret); + } + *len = lenVal; + ret = clGetKernelInfo(kernel, CL_KERNEL_FUNCTION_NAME, lenVal, name, NULL); + map_cl_error_2_ee(ret); +} + inline EE get_program_info_from_kernel(Kernel kernel, Program *program) { cl_int ret = clGetKernelInfo(kernel, CL_KERNEL_PROGRAM, sizeof(Program), program, NULL); diff --git a/common/gcl/src/ocl_data_trans.cpp b/common/gcl/src/ocl_data_trans.cpp index ab0aa94d..0b16d266 100644 --- a/common/gcl/src/ocl_data_trans.cpp +++ b/common/gcl/src/ocl_data_trans.cpp @@ -413,7 +413,7 @@ EE ocl_trans_mem( CHECK_STATUS(NOT_MATCH); } CHECK_STATUS(set_padding_opt_mali( - true, Pad_Constant, DT_F16, GCL_MEM_BUF, GCL_MEM_BUF, kernelName, &kernelOpt)); + true, PAD_CONSTANT, DT_F16, GCL_MEM_BUF, GCL_MEM_BUF, kernelName, &kernelOpt)); CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel, &kernelOpt)); CHECK_STATUS(gcl_set_kernelArgs(kernel, sw_str, sh_str, dw_str, dh_str, 0, 0, sw_str, sh_str, dw_str, dh_str, pl, pr, pt, pb, gs[0], gs[1], srcMem, dstMem)); @@ -494,7 +494,7 @@ EE ocl_map_mem_write( CHECK_STATUS(NOT_MATCH); } CHECK_STATUS(set_padding_opt_mali( - true, Pad_Constant, DT_F16, GCL_MEM_BUF, GCL_MEM_BUF, kernelName, &kernelOpt)); + true, PAD_CONSTANT, DT_F16, GCL_MEM_BUF, GCL_MEM_BUF, kernelName, &kernelOpt)); CHECK_STATUS(gcl_get_kernel_from_map(handle, kernelName, &kernel, &kernelOpt)); CHECK_STATUS(gcl_set_kernelArgs(kernel, w, h, w_str, h_str, offset, 0, w, h, w_str, h_str, pl, pr, pt, pb, gs[0], gs[1], gclMem->mem, gclMem->mem)); diff --git a/common/gcl/tools/gcl_sample/sample.cpp b/common/gcl/tools/gcl_sample/sample.cpp index b496ad07..8d7821a9 100644 --- a/common/gcl/tools/gcl_sample/sample.cpp +++ 
b/common/gcl/tools/gcl_sample/sample.cpp @@ -10,7 +10,6 @@ // WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -#ifdef _USE_FP16 #include "gcl.h" #include "ocl_context.h" @@ -128,22 +127,22 @@ int main() oc_str = oc_str / (ot * on); on_str = owh_str * oc_str; - // F16* input_val = (F16*)malloc(inputGclDesc.byteSize); - // F16* filter_val = (F16*)malloc(filterGclDesc.byteSize); - // F16* bias_val = (F16*)malloc(biasGclDesc.byteSize); - // for (U32 i = 0; i < inputGclDesc.num; i++) input_val[i] = (i % 16) * 0.1; - // for (U32 i = 0; i < filterGclDesc.num; i++) filter_val[i] = (i % 16) * 0.1; - // for (U32 i = 0; i < biasGclDesc.num * 4; i++) bias_val[i] = 1.0; - // U32 size[3] = {1, 1, 1}; - // size[0] = inputGclDesc.byteSize; - // CHECK_STATUS(gcl_trans_memory(handle, input_val, input, size, HOST_TO_DEVICE_BUF, CL_TRUE)); - // size[0] = filterGclDesc.byteSize; - // CHECK_STATUS(gcl_trans_memory(handle, filter_val, flt, size, HOST_TO_DEVICE_BUF, CL_TRUE)); - // size[0] = biasGclDesc.num; - // CHECK_STATUS(gcl_trans_memory(handle, bias_val, bias, size, HOST_TO_DEVICE_IMG, CL_TRUE)); + // F16* input_val = (F16*)malloc(inputGclDesc.byteSize); + // F16* filter_val = (F16*)malloc(filterGclDesc.byteSize); + // F16* bias_val = (F16*)malloc(biasGclDesc.byteSize); + // for (U32 i = 0; i < inputGclDesc.num; i++) input_val[i] = (i % 16) * 0.1; + // for (U32 i = 0; i < filterGclDesc.num; i++) filter_val[i] = (i % 16) * 0.1; + // for (U32 i = 0; i < biasGclDesc.num * 4; i++) bias_val[i] = 1.0; + // U32 size[3] = {1, 1, 1}; + // size[0] = inputGclDesc.byteSize; + // CHECK_STATUS(gcl_trans_memory(handle, input_val, input, size, HOST_TO_DEVICE_BUF, CL_TRUE)); + // size[0] = filterGclDesc.byteSize; + // CHECK_STATUS(gcl_trans_memory(handle, filter_val, flt, size, HOST_TO_DEVICE_BUF, CL_TRUE)); + // size[0] = biasGclDesc.num; + // CHECK_STATUS(gcl_trans_memory(handle, bias_val, bias, size, HOST_TO_DEVICE_IMG, CL_TRUE)); // - // CHECK_STATUS(gcl_check_buf(handle, input->mem, inputGclDesc.byteSize, false, "input")); - // CHECK_STATUS(gcl_check_buf(handle, flt->mem, filterGclDesc.byteSize, false, "filter")); + // CHECK_STATUS(gcl_check_buf(handle, input->mem, inputGclDesc.byteSize, false, "input")); + // CHECK_STATUS(gcl_check_buf(handle, flt->mem, filterGclDesc.byteSize, false, "filter")); gcl_finish(handle); for (U32 item_bn = 2; item_bn <= 4; item_bn++) { for (U32 item_kn = 1; item_kn <= 2; item_kn = item_kn * 2) { @@ -160,10 +159,10 @@ int main() } Kernel kernel; - char kernelName[1024]; - sprintf(kernelName, "conv_direct_multi_batch_s1_%d%d%d%d%d", fw, fh, item_w, - item_kn, item_bn); - CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); + std::string kernelName = std::string("conv_direct_multi_batch_s1_") + + std::to_string(fw) + std::to_string(fh) + std::to_string(item_w) + + std::to_string(item_kn) + std::to_string(item_bn); + CHECK_STATUS(gcl_create_kernel(handle, kernelName.c_str(), &kernel)); if (oc_str % item_kn != 0) { continue; } @@ -174,7 +173,7 @@ int main() CHECK_STATUS(gcl_set_kernelArgs(kernel, ih_str, iwh_str, ic_str, ih_off, iw_off, oh_str, owh_str, oh_off, ow_off, ow, oc, on, sh, in_str, on_str, gs[0], gs[1], input->mem, flt->mem, bias->mem, output->mem)); - gcl_set_kernelVec(handle, 
kernel, dim, gs, ls, kernelName); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName.c_str()); CHECK_STATUS(gcl_run_kernel_select_ls(handle, &kernelVec[0])); #ifdef _DEBUG CHECK_STATUS(gcl_run_kernelVec_timing(handle, 0, handle->kernelVec->size())); @@ -185,12 +184,11 @@ int main() #else CHECK_STATUS(gcl_run_kernelVec(handle)); #endif - // CHECK_STATUS(gcl_check_buf(handle, output->mem, outputGclDesc.byteSize, false, "output")); - // CHECK_STATUS(gcl_fill_memory_zero(handle, output)); + // CHECK_STATUS(gcl_check_buf(handle, output->mem, outputGclDesc.byteSize, false, "output")); + // CHECK_STATUS(gcl_fill_memory_zero(handle, output)); CHECK_STATUS(gcl_clean_kernelVec(handle)); gcl_finish(handle); } } } } -#endif diff --git a/common/gcl/tools/kernel_lib_compile/kernel_bin/clbinary.cpp b/common/gcl/tools/kernel_lib_compile/kernel_bin/clbinary.cpp index c6f2e89d..469a238f 100644 --- a/common/gcl/tools/kernel_lib_compile/kernel_bin/clbinary.cpp +++ b/common/gcl/tools/kernel_lib_compile/kernel_bin/clbinary.cpp @@ -164,9 +164,9 @@ int main(I32 argc, I8 *argv[]) U32 srcLen = imageLen + half16Len + clcodeLen; I8 *source = new I8[srcLen]; #ifdef CL_VERSION_1_2 - memcpy(source, imagesource, imageLen); + UNI_MEMCPY(source, imagesource, imageLen); #endif - memcpy(source + imageLen, half16source, half16Len); + UNI_MEMCPY(source + imageLen, half16source, half16Len); FileStatus = LoadBinFile(FLAGS_inputFilename, source + imageLen + half16Len, clcodeLen); if (!FileStatus) { printf("load bin file failed\n"); diff --git a/common/gcl/tools/kernel_lib_compile/kernel_bin2char/bin2char.cpp b/common/gcl/tools/kernel_lib_compile/kernel_bin2char/bin2char.cpp index 3b73ffba..cfd4c113 100644 --- a/common/gcl/tools/kernel_lib_compile/kernel_bin2char/bin2char.cpp +++ b/common/gcl/tools/kernel_lib_compile/kernel_bin2char/bin2char.cpp @@ -12,11 +12,9 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
#include -#include #include #include #include -#include int main(int argc, char *argv[]) { @@ -55,57 +53,47 @@ } binMapName = argv[3]; } else { - printf("please input .bin name + binmapname or input .bin name + .cpp name + binmapname\n"); + printf("[ERROR] please pass xxx.bin name + binmapname or xxx.bin name + xxx.cpp name + " + "binmapname.\n"); + return 1; } FILE *fpbin = fopen(binFile.c_str(), "rb"); if (fpbin == NULL) { - printf("file %s open error\n", binFile.c_str()); + printf("[ERROR] can not open file %s.\n", binFile.c_str()); return 1; } struct stat f_stat; if (stat(binFile.c_str(), &f_stat) == -1) { - printf("file %s get size error\n", binFile.c_str()); + printf("[ERROR] can not get file %s size.\n", binFile.c_str()); fclose(fpbin); return 1; } int filelen = f_stat.st_size; - std::stringstream templen; - templen << filelen; - std::string filelen_st = templen.str(); - std::string str = "#include \"inline_" + std::string(binMapName) + ".h\"\n\nCU32 " + - std::string(charName) + "_len = " + filelen_st + ";\nCU8 " + std::string(charName) + - "[] = {"; - - unsigned char charRead; - std::string appendBuf; - + std::string str = "#include \"inline_" + std::string(binMapName) + ".h\"\n\nCU32 " + + std::string(charName) + "_len = " + std::to_string(filelen) + ";\nCU8 " + + std::string(charName) + "[] = {"; + std::stringstream ss; for (int i = 0; i < filelen; i++) { - appendBuf.clear(); + unsigned char c; if (i % 20 == 0) { - appendBuf += "\n"; + ss << "\n"; } - if (1 != fread(&charRead, 1, 1, fpbin)) { - printf("file %s read error\n", binFile.c_str()); + if (1 != fread(&c, 1, 1, fpbin)) { + printf("[ERROR] can not read file %s content.\n", binFile.c_str()); fclose(fpbin); return 1; } - char tempstr[4]; - sprintf(tempstr, "0x%02x", charRead); - appendBuf += std::string(tempstr); - + ss << "0x" << std::hex << std::setw(2) << std::setfill('0') << (int)c; if (i == filelen - 1) { } else if (i % 20 == 19) { - appendBuf += ","; + ss << ","; } else { - appendBuf += ", "; + ss << ", "; } - str += appendBuf; } - - str += "};"; + str += ss.str() + "};"; std::ofstream file; file.open(cppFile.c_str()); @@ -113,6 +101,5 @@ file.close(); fclose(fpbin); - return 0; } diff --git a/common/gcl/tools/kernel_source_compile/kernel_cl2char/cl2char.cpp b/common/gcl/tools/kernel_source_compile/kernel_cl2char/cl2char.cpp index 2dc1cd0e..2a0d5871 100644 --- a/common/gcl/tools/kernel_source_compile/kernel_cl2char/cl2char.cpp +++ b/common/gcl/tools/kernel_source_compile/kernel_cl2char/cl2char.cpp @@ -378,11 +378,7 @@ int main() if (boltEnv == NULL) { UNI_ERROR_LOG("BOLT_ROOT env value has not been set successfully\n"); }; - std::string boltPath = boltEnv; - CI8 lastFlag = boltPath[boltPath.length() - 1]; - if (strcmp(&lastFlag, "/") != 0) { - boltPath += "/"; - } + std::string boltPath = boltEnv + std::string("/"); std::string tensorComputingClPath = "compute/tensor/src/gpu/mali/cl/"; std::string tensorComputingClPathQc = "compute/tensor/src/gpu/mali/cl/qualcomm/"; std::string imageClPath = "compute/image/src/gpu/mali/cl/"; diff --git a/common/memory/include/memory_cpu.hpp b/common/memory/include/memory_cpu.hpp index 7ac82f70..eb654d49 100644 --- a/common/memory/include/memory_cpu.hpp +++ b/common/memory/include/memory_cpu.hpp @@ -18,7 +18,7 @@ inline void *CPUMemoryAlignedAlloc(size_t alignment, size_t bytes) { - void *ptr = (void **)operator new(bytes + sizeof(void *) + alignment - 1); + void *ptr = (void **)UNI_OPERATOR_NEW(bytes + sizeof(void *) + alignment - 1); CHECK_REQUIREMENT(ptr != NULL); void **aligned_ptr = (void 
**)(((uintptr_t)(ptr) + sizeof(void *) + alignment - 1) & ~(alignment - 1)); @@ -28,7 +28,7 @@ inline void *CPUMemoryAlignedAlloc(size_t alignment, size_t bytes) inline void CPUMemoryAlignedfree(void *aligned_ptr) { - operator delete(((void **)aligned_ptr)[-1]); + UNI_OPERATOR_DELETE(((void **)aligned_ptr)[-1]); } class CpuMemory : public Memory { @@ -39,7 +39,8 @@ class CpuMemory : public Memory { this->allocated = false; } - ~CpuMemory() = default; + ~CpuMemory() + {} std::shared_ptr clone(bool allocate) override { @@ -71,13 +72,13 @@ class CpuMemory : public Memory { this->capacitySize = size; try { #ifndef _USE_X86 - this->val = std::shared_ptr((U8 *)operator new(size)); + this->val = std::shared_ptr((U8 *)UNI_OPERATOR_NEW(size), UNI_OPERATOR_DELETE); #else this->val = std::shared_ptr( (U8 *)CPUMemoryAlignedAlloc(64, size), CPUMemoryAlignedfree); #endif } catch (const std::bad_alloc &e) { - UNI_ERROR_LOG("CPU memory alloc %d bytes failed\n", (int)size); + UNI_ERROR_LOG("CPU memory alloc %d bytes failed.\n", (int)size); } } this->allocated = true; @@ -179,7 +180,7 @@ class CpuMemory : public Memory { std::string string(U32 num, F32 factor) override { U32 capacityNum = this->capacitySize / bytesOf(this->desc.dt); - std::string line = "desc: " + tensorDesc2Str(this->desc) + " data:"; + std::string line = "desc:" + tensorDesc2Str(this->desc) + " data:"; for (U32 i = 0; i < num && i < capacityNum; i++) { line = line + std::to_string(this->element(i) / factor) + " "; } @@ -187,7 +188,7 @@ class CpuMemory : public Memory { for (U32 i = 0; i < UNI_MIN(tensorNumElements(this->desc), capacityNum); i++) { sum += this->element(i) / factor; } - line += " sum: " + std::to_string(sum); + line += " sum:" + std::to_string(sum); return line; } diff --git a/common/memory/include/memory_ocl.hpp b/common/memory/include/memory_ocl.hpp index a7194cce..488880f1 100644 --- a/common/memory/include/memory_ocl.hpp +++ b/common/memory/include/memory_ocl.hpp @@ -23,7 +23,7 @@ class OclMemory : public Memory { public: OclMemory() { - memset(&(this->desc), 0, sizeof(GCLMemDesc)); + UNI_MEMSET(&(this->desc), 0, sizeof(GCLMemDesc)); this->desc.memFormat = DF_NCHW; this->desc.memType = GCL_MEM_BUF; this->desc.flags = CL_MEM_READ_WRITE; @@ -202,14 +202,14 @@ class OclMemory : public Memory { if (!allocated) { U8 *tmp = nullptr; if (size < this->desc.byteSize) { - U8 *tmp = (U8 *)operator new(this->desc.byteSize); - memset(tmp, 0, this->desc.byteSize); - memcpy(tmp, host_ptr, size); + U8 *tmp = (U8 *)UNI_OPERATOR_NEW(this->desc.byteSize); + UNI_MEMSET(tmp, 0, this->desc.byteSize); + UNI_MEMCPY(tmp, host_ptr, size); host_ptr = tmp; } this->alloc(host_ptr); - if (tmp) { - delete tmp; + if (tmp != nullptr) { + UNI_OPERATOR_DELETE(tmp); } } else { this->val->desc = this->desc; //TODO DELETE AFTER SPLITE DESC FROM GCLMEM @@ -345,7 +345,7 @@ class OclMemory : public Memory { std::string string(U32 num, F32 factor) override { - std::string line = "desc: " + gclMemDesc2Str(this->desc) + " data: "; + std::string line = "desc:" + gclMemDesc2Str(this->desc) + " data:"; #ifdef _DEBUG DataType dt = (this->desc.dt == DT_U8) ? 
DT_F16 : this->desc.dt; if (dt == DT_U32) { @@ -374,7 +374,7 @@ class OclMemory : public Memory { for (U32 i = 0; i < this->length(); i++) { sum += this->element(i) / factor; } - line += " sum: " + std::to_string(sum); + line += " sum:" + std::to_string(sum); } #endif return line; diff --git a/common/memory/include/memory_ocl_img.hpp b/common/memory/include/memory_ocl_img.hpp index 6865aa43..abd6a7f1 100644 --- a/common/memory/include/memory_ocl_img.hpp +++ b/common/memory/include/memory_ocl_img.hpp @@ -127,9 +127,9 @@ class OclMemoryImg : public OclMemory { U8 *tmp = nullptr; if (size < this->desc.byteSize) { if (this->get_mem_type() == OCLMemImg1D) { - U8 *tmp = (U8 *)operator new(this->bytes()); - memset(tmp, 0, this->bytes()); - memcpy(tmp, host_ptr, size); + tmp = (U8 *)UNI_OPERATOR_NEW(this->bytes()); + UNI_MEMSET(tmp, 0, this->bytes()); + UNI_MEMCPY(tmp, host_ptr, size); host_ptr = tmp; } else { CHECK_STATUS(NOT_MATCH); @@ -146,6 +146,9 @@ class OclMemoryImg : public OclMemory { CHECK_STATUS(NOT_SUPPORTED); } } + if (tmp != nullptr) { + UNI_OPERATOR_DELETE(tmp); + } } else { if (!allocated) { this->alloc(); diff --git a/common/memory/include/tensor.hpp b/common/memory/include/tensor.hpp index abd6d20f..37966c27 100644 --- a/common/memory/include/tensor.hpp +++ b/common/memory/include/tensor.hpp @@ -85,6 +85,11 @@ class Tensor { *(this->scale) = scale; } + void set_scale_ptr(std::shared_ptr scale) + { + this->scale = scale; + } + F32 get_scale() { return *(this->scale); @@ -97,7 +102,7 @@ class Tensor { void copy_from(Tensor *other) { - memcpy(this->scale.get(), other->scale.get(), sizeof(F32)); + UNI_MEMCPY(this->scale.get(), other->scale.get(), sizeof(F32)); this->val->copy_from(other->val.get()); } diff --git a/common/memory/include/tensor_common.h b/common/memory/include/tensor_common.h index b0672299..ab912567 100644 --- a/common/memory/include/tensor_common.h +++ b/common/memory/include/tensor_common.h @@ -40,7 +40,7 @@ static void transformToNCHWKernel( case DF_NCHW: { if (in == on && ic == oc && ih == oh && iw == ow) { if (output != input) { - memcpy(output, input, tensorNumBytes(outputDesc)); + UNI_MEMCPY(output, input, tensorNumBytes(outputDesc)); } } else { U32 tileSize = UNI_MIN(iw, ow) * bytesOf(idt); @@ -49,7 +49,7 @@ static void transformToNCHWKernel( for (U32 h = 0; h < oh && h < ih; h++) { U32 srcIndex = ((n * ic + c) * ih + h) * iw; U32 dstIndex = ((n * oc + c) * oh + h) * ow; - memcpy(output + dstIndex, input + srcIndex, tileSize); + UNI_MEMCPY(output + dstIndex, input + srcIndex, tileSize); } } } @@ -169,7 +169,7 @@ static void transformToNHWCKernel( case DF_NHWC: { CHECK_REQUIREMENT(tensorNumElements(inputDesc) == size); if (input != output) { - memcpy(output, input, tensorNumBytes(inputDesc)); + UNI_MEMCPY(output, input, tensorNumBytes(inputDesc)); } break; } @@ -262,9 +262,9 @@ EE transformNCHWToNCHWC8( // support channel padding if (c_i < ic) { U32 srcIndex = (((n * ic + c_i) * ih + h) * iw + w) * elementSize; - memcpy(outputPtr + dstIndex, inputPtr + srcIndex, elementSize); + UNI_MEMCPY(outputPtr + dstIndex, inputPtr + srcIndex, elementSize); } else { - memset(outputPtr + dstIndex, 0, elementSize); + UNI_MEMSET(outputPtr + dstIndex, 0, elementSize); } } } @@ -299,9 +299,9 @@ EE transformNHWCToNCHWC8( // support channel padding if (c_i < ic) { U32 srcIndex = (((n * ih + h) * iw + w) * ic + c_i) * elementSize; - memcpy(outputPtr + dstIndex, inputPtr + srcIndex, elementSize); + UNI_MEMCPY(outputPtr + dstIndex, inputPtr + srcIndex, elementSize); } else { - 
memset(outputPtr + dstIndex, 0, elementSize); + UNI_MEMSET(outputPtr + dstIndex, 0, elementSize); } } } @@ -318,7 +318,7 @@ EE transformNCHWC8ToNCHWC8ByGroup( U32 outputSize = tensorNumElements(outputDesc); if (group <= 1 || inputSize == outputSize) { if (input != output) { - memcpy(output, input, outputSize); + UNI_MEMCPY(output, input, outputSize); } return SUCCESS; } @@ -354,10 +354,10 @@ EE transformNCHWC8ToNCHWC8ByGroup( U32 srcIndex = ((((n * ict + id_a) * ih + h) * iw + w) * channelAlignSize + id_b) * elementSize; - memcpy( + UNI_MEMCPY( (U8 *)output + dstIndex, (const U8 *)input + srcIndex, elementSize); } else { - memset((U8 *)output + dstIndex, 0, elementSize); + UNI_MEMSET((U8 *)output + dstIndex, 0, elementSize); } } } @@ -417,7 +417,7 @@ EE transposeFilter(TensorDesc inputDesc, const void *input, TensorDesc outputDes for (U32 hw = 0; hw < ih * iw; hw++) { U32 srcIndex = o * ih * iw * innerSize + hw * innerSize; U32 dstIndex = o * ih * iw * innerSize + (hwMax - hw) * innerSize; - memcpy(outputPtr + dstIndex, inputPtr + srcIndex, innerSize); + UNI_MEMCPY(outputPtr + dstIndex, inputPtr + srcIndex, innerSize); } } break; @@ -475,7 +475,7 @@ EE array_transpose(DataType dt, inputIndex = (inputIndex + inputLocalIndex[j]) * inputDims[j - 1]; } inputIndex += inputLocalIndex[sizeInnerIndex]; - memcpy(outputPtr + i * tileSize, inputPtr + inputIndex * tileSize, tileSize); + UNI_MEMCPY(outputPtr + i * tileSize, inputPtr + inputIndex * tileSize, tileSize); } return SUCCESS; @@ -513,7 +513,7 @@ EE array_transpose_naive(DataType dt, inputIndex = (inputIndex + inputLocalIndex[j]) * inputDims[j - 1]; } inputIndex += inputLocalIndex[0]; - memcpy(outputPtr + i * tileSize, inputPtr + inputIndex * tileSize, tileSize); + UNI_MEMCPY(outputPtr + i * tileSize, inputPtr + inputIndex * tileSize, tileSize); } return SUCCESS; diff --git a/common/memory/include/tensor_desc.h b/common/memory/include/tensor_desc.h index f3b92a6e..d2353a02 100644 --- a/common/memory/include/tensor_desc.h +++ b/common/memory/include/tensor_desc.h @@ -20,11 +20,14 @@ #include "data_type.h" #include "error.h" +#include "secure_c_wrapper.h" #ifdef _USE_GPU #define CL_TARGET_OPENCL_VERSION 200 #include "CL/cl.h" #endif +#define DIM_LEN 6 + typedef enum { DF_NCHW, DF_NCHWN16, // vectorize for N=16, for filter @@ -68,7 +71,8 @@ typedef enum { DF_NKN12K4, // Optimized MMM filter for INT8 DF_NKNx_NKN32, // Optimized LSTM filter DF_NCHWC16, // vectorize for C=16, for input and output - DF_NCHWC2NxC4 + DF_NCHWC2NxC4, + DF_SCALAR } DataFormat; inline const char *const *DataFormatName() @@ -79,7 +83,8 @@ inline const char *const *DataFormatName() "DF_MKT", "DF_NK", "DF_NKN16", "DF_NKN32", "DF_NKN64", "DF_NKN32K4", "DF_NCHWC4", "DF_NCHWC3", "DF_NHWC", "DF_NCHWN4C4", "DF_NCHWN4", "DF_HWCN", "DF_NCWHN4C4", "DF_NHWCN4", "DF_CHWNC4", "DF_CHWNC8", "DF_CHWNC16", "DF_CHWC8_NCN8", "DF_RGB", "DF_HWNCN8", "DF_NKN24", - "DF_NKN12", "DF_NKN8", "DF_NKN12K4", "DF_NKNx_NKN32", "DF_NCHWC16", "DF_NCHWC2NxC4"}; + "DF_NKN12", "DF_NKN8", "DF_NKN12K4", "DF_NKNx_NKN32", "DF_NCHWC16", "DF_NCHWC2NxC4", + "DF_SCALAR"}; return names; } @@ -87,13 +92,13 @@ typedef struct TensorDesc { DataType dt = DT_U8; DataFormat df = DF_NCHW; U32 nDims = 0; - U32 dims[6] = {0}; + U32 dims[DIM_LEN] = {0}; } TensorDesc; inline TensorDesc tensor0d() { TensorDesc desc; - memset(&desc, 0, sizeof(TensorDesc)); + UNI_MEMSET(&desc, 0, sizeof(TensorDesc)); return desc; } @@ -365,20 +370,38 @@ inline U8 tensorIs5d(TensorDesc desc) return 5 == desc.nDims; } +// in order to support shape 
calculation, there is a reserved buffer in TensorDesc.dims to save. +inline U8 tensorIsShape(TensorDesc desc) +{ + U32 length = tensorNumElements(desc); + U8 ret = 0; + if (desc.dt == DT_U32 && length > 0 && length + desc.nDims <= DIM_LEN) { + ret = 1; + } + return ret; +} + inline std::string tensorDesc2Str(TensorDesc desc) { std::string descStr = "dt:" + std::string(DataTypeName()[desc.dt]) + " df:" + std::string(DataFormatName()[desc.df]) + " dims:" + std::to_string(desc.nDims); - if (desc.nDims > 0) { descStr += "("; - } - for (I32 i = int(desc.nDims) - 1; i >= 0; i--) { - descStr += std::to_string(desc.dims[i]); - if (i > 0) { - descStr += ","; - } else { - descStr += ")"; + for (I32 i = int(desc.nDims) - 1; i > 0; i--) { + descStr += std::to_string(desc.dims[i]) + ","; + } + descStr += std::to_string(desc.dims[0]) + ")"; + if (tensorIsShape(desc)) { + U32 length = tensorNumElements(desc); + descStr += " reserve:("; + for (U32 i = desc.nDims; i < desc.nDims + length && i < DIM_LEN; i++) { + descStr += std::to_string((int)desc.dims[i]); + if (i + 1 < desc.nDims + length && i + 1 < DIM_LEN) { + descStr += ","; + } else { + descStr += ")"; + } + } } } @@ -387,15 +410,15 @@ inline std::string tensorDesc2Str(TensorDesc desc) inline int tensorDescIsValid(TensorDesc desc) { - if (desc.dt < 0 || desc.dt >= 10) { + if (desc.dt < 0 || desc.dt >= DT_NUM) { return 0; } - if (desc.df < 0 || desc.df >= 30) { + if (desc.df < 0 || desc.df >= 50) { return 0; } - if (desc.nDims > 6) { + if (desc.nDims > DIM_LEN) { return 0; } @@ -427,6 +450,7 @@ inline DataFormat getTensorDefaultDataFormat(int nDims) return df; } +// return format is [w, h, c, n] inline std::vector calculateLocalIndex(U32 index, const U32 *dims, U32 nDims) { std::vector indexes(nDims); @@ -441,7 +465,8 @@ inline U32 calculateGlobalIndex(const U32 *indexes, const U32 *dims, U32 nDims) { U32 index = 0; for (int i = ((int)nDims) - 1; i >= 0; i--) { - index = index * dims[i] + indexes[i]; + U32 value = indexes[i] >= dims[i] ? 
0 : indexes[i]; + index = index * dims[i] + value; } return index; } @@ -470,13 +495,13 @@ typedef enum { } GCLMemType; struct GCLMemDesc { - U32 dims[6]; + U32 dims[DIM_LEN]; U32 nDims; DataType dt; DataFormat df; U32 stride[3]; - U32 offset[6]; + U32 offset[DIM_LEN]; GCLMemType memType; DataFormat memFormat; U32 byteSize; diff --git a/common/memory/include/tensor_transpose.h b/common/memory/include/tensor_transpose.h index 5a37ab33..63a097f4 100644 --- a/common/memory/include/tensor_transpose.h +++ b/common/memory/include/tensor_transpose.h @@ -16,10 +16,10 @@ #include "tensor_desc.h" #include "uni.h" -#include "thread_affinity.h" +#include "affinity_policy.h" template -inline static void transformToNCHWKernel( +inline static EE transformToNCHWKernel( TensorDesc inputDesc, const T *input, TensorDesc outputDesc, T *output) { DataType idt, odt; @@ -40,24 +40,30 @@ inline static void transformToNCHWKernel( } else if (tensorIs4d(inputDesc)) { CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); } else { - UNI_ERROR_LOG("not support transform %d-dim tensor to NCHW format\n", (int)inputDesc.nDims); - return; + UNI_ERROR_LOG("not support transform %d-dim tensor to NCHW format.\n", (int)inputDesc.nDims); + return NOT_SUPPORTED; } - if (tensorIs3d(outputDesc)) { + if (tensorIs2d(outputDesc)) { + CHECK_STATUS(tensor2dGet(outputDesc, &odt, &odf, &on, &oc)); + oh = ow = 1; + } else if (tensorIs3d(outputDesc)) { CHECK_STATUS(tensor3dGet(outputDesc, &odt, &odf, &on, &oc, &oh)); ow = 1; } else if (tensorIs4d(outputDesc)) { CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); } else { - UNI_ERROR_LOG("not support transform to %d-dim NCHW tensor\n", (int)outputDesc.nDims); - return; + UNI_ERROR_LOG("not support transform to %d-dim NCHW tensor.\n", (int)outputDesc.nDims); + return NOT_SUPPORTED; } CHECK_REQUIREMENT(idt == odt); + EE ret = SUCCESS; switch (idf) { + case DF_NORMAL: + case DF_MTK: case DF_NCHW: { if (in == on && ic == oc && ih == oh && iw == ow) { if (output != input) { - memcpy(output, input, tensorNumBytes(outputDesc)); + UNI_MEMCPY(output, input, tensorNumBytes(outputDesc)); } } else { U32 tileSize = UNI_MIN(iw, ow) * bytesOf(idt); @@ -66,7 +72,7 @@ inline static void transformToNCHWKernel( for (U32 h = 0; h < oh && h < ih; h++) { U32 srcIndex = ((n * ic + c) * ih + h) * iw; U32 dstIndex = ((n * oc + c) * oh + h) * ow; - memcpy(output + dstIndex, input + srcIndex, tileSize); + UNI_MEMCPY(output + dstIndex, input + srcIndex, tileSize); } } } @@ -160,49 +166,56 @@ inline static void transformToNCHWKernel( break; } default: { - UNI_ERROR_LOG("not support transform %s format to NCHW format\n", DataFormatName()[idf]); + UNI_ERROR_LOG( + "not support transform %s format to NCHW format.\n", DataFormatName()[idf]); + ret = NOT_SUPPORTED; + break; } } + return ret; } inline EE transformToNCHW( TensorDesc inputDesc, const void *input, TensorDesc outputDesc, void *output) { if (nullptr == input || nullptr == output) { - return NULL_POINTER; + CHECK_STATUS(NULL_POINTER); } + EE ret = NOT_SUPPORTED; switch (inputDesc.dt) { #ifdef _USE_FP32 case DT_F32: { - transformToNCHWKernel(inputDesc, (F32 *)input, outputDesc, (F32 *)output); + ret = transformToNCHWKernel(inputDesc, (F32 *)input, outputDesc, (F32 *)output); break; } #endif #ifdef _USE_FP16 case DT_F16: { - transformToNCHWKernel(inputDesc, (F16 *)input, outputDesc, (F16 *)output); + ret = transformToNCHWKernel(inputDesc, (F16 *)input, outputDesc, (F16 *)output); break; } #endif #ifdef _USE_INT8 case DT_I8: { - 
transformToNCHWKernel(inputDesc, (INT8 *)input, outputDesc, (INT8 *)output); + ret = transformToNCHWKernel(inputDesc, (INT8 *)input, outputDesc, (INT8 *)output); break; } case DT_U8_Q: { - transformToNCHWKernel(inputDesc, (UINT8 *)input, outputDesc, (UINT8 *)output); + ret = transformToNCHWKernel( + inputDesc, (UINT8 *)input, outputDesc, (UINT8 *)output); break; } #endif default: { - return NOT_SUPPORTED; + UNI_ERROR_LOG("not support transform %s type tensor.\n", DataTypeName()[inputDesc.dt]); + break; } } - return SUCCESS; + return ret; } template -inline static void transformToNHWCKernel( +inline static EE transformToNHWCKernel( TensorDesc inputDesc, const T *input, TensorDesc outputDesc, T *output) { DataType idt, odt; @@ -219,19 +232,27 @@ inline static void transformToNHWCKernel( CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); } else { UNI_ERROR_LOG("not support transform %d-dim tensor to NHWC format\n", (int)inputDesc.nDims); - return; + return NOT_SUPPORTED; + } + if (tensorIs4d(outputDesc)) { + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + } else { + UNI_ERROR_LOG("not support transform to %d-dim NHWC tensor.\n", (int)outputDesc.nDims); + return NOT_SUPPORTED; } - CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); U32 size = tensorNumElements(outputDesc); U32 ihiw = ih * iw; + EE ret = SUCCESS; switch (idf) { case DF_NHWC: { CHECK_REQUIREMENT(tensorNumElements(inputDesc) == size); if (input != output) { - memcpy(output, input, tensorNumBytes(inputDesc)); + UNI_MEMCPY(output, input, tensorNumBytes(inputDesc)); } break; } + case DF_NORMAL: + case DF_MTK: case DF_NCHW: { CHECK_REQUIREMENT(tensorNumElements(inputDesc) == size); for (U32 o = 0, srcIndex = 0; o < in; o++) { @@ -244,14 +265,16 @@ inline static void transformToNHWCKernel( } break; } - case DF_NCHWC8: { - CHECK_REQUIREMENT(ic % 8 == 0); - ic /= 8; + case DF_NCHWC8: + case DF_NCHWC16: { + U32 align = (idf == DF_NCHWC16) ? 
16 : 8; + CHECK_REQUIREMENT(ic % align == 0); + ic /= align; for (U32 n = 0, srcIndex = 0; n < in; n++) { for (U32 c = 0; c < ic; c++) { for (U32 hw = 0; hw < ihiw; hw++) { - for (U32 c8 = 0; c8 < 8; c8++, srcIndex++) { - U32 dstIndex = ((n * ihiw + hw) * ic + c) * 8 + c8; + for (U32 cx = 0; cx < align; cx++, srcIndex++) { + U32 dstIndex = ((n * ihiw + hw) * ic + c) * align + cx; output[dstIndex] = input[srcIndex]; } } @@ -262,8 +285,11 @@ inline static void transformToNHWCKernel( default: { UNI_ERROR_LOG( "not support transform %s format tensor to NHWC format\n", DataFormatName()[idf]); + ret = NOT_SUPPORTED; + break; } } + return ret; } inline EE transformToNHWC( @@ -272,30 +298,32 @@ inline EE transformToNHWC( if (nullptr == input || nullptr == output) { return NULL_POINTER; } + EE ret = NOT_SUPPORTED; switch (inputDesc.dt) { #ifdef _USE_FP32 case DT_F32: { - transformToNHWCKernel(inputDesc, (F32 *)input, outputDesc, (F32 *)output); + ret = transformToNHWCKernel(inputDesc, (F32 *)input, outputDesc, (F32 *)output); break; } #endif #ifdef _USE_FP16 case DT_F16: { - transformToNHWCKernel(inputDesc, (F16 *)input, outputDesc, (F16 *)output); + ret = transformToNHWCKernel(inputDesc, (F16 *)input, outputDesc, (F16 *)output); break; } #endif #ifdef _USE_INT8 case DT_I8: { - transformToNHWCKernel(inputDesc, (INT8 *)input, outputDesc, (INT8 *)output); + ret = transformToNHWCKernel(inputDesc, (INT8 *)input, outputDesc, (INT8 *)output); break; } #endif default: { - return NOT_SUPPORTED; + UNI_ERROR_LOG("not support transform %s type tensor.\n", DataTypeName()[inputDesc.dt]); + break; } } - return SUCCESS; + return ret; } inline EE transformNCHWC16ToNCHWC8( @@ -309,7 +337,7 @@ inline EE transformNCHWC16ToNCHWC8( U32 in, ic, ih, iw, on, oc, oh, ow; if (tensorIs2d(inputDesc)) { if (input != output) { - memcpy(output, input, tensorNumBytes(inputDesc)); + UNI_MEMCPY(output, input, tensorNumBytes(inputDesc)); } return SUCCESS; } else if (tensorIs3d(inputDesc)) { @@ -333,7 +361,7 @@ inline EE transformNCHWC16ToNCHWC8( U32 srcIndex = n * ic * ih * iw + c * ih * iw * 8 + (h * iw + w) * 16 + c8 * 8; U32 dstIndex = n * ic * ih * iw + (c + c8) * ih * iw * 8 + (h * iw + w) * 8; - memcpy(outputPtr + dstIndex * elementSize, + UNI_MEMCPY(outputPtr + dstIndex * elementSize, inputPtr + srcIndex * elementSize, elementSize * 8); } } @@ -354,7 +382,7 @@ inline EE transformNCHWToNCHWC8( U32 in, ic, ih, iw, on, oc, oh, ow; if (tensorIs2d(inputDesc)) { if (input != output) { - memcpy(output, input, tensorNumBytes(inputDesc)); + UNI_MEMCPY(output, input, tensorNumBytes(inputDesc)); } return SUCCESS; } else if (tensorIs3d(inputDesc)) { @@ -379,9 +407,9 @@ inline EE transformNCHWToNCHWC8( // support channel padding if (c_i < ic) { U32 srcIndex = (((n * ic + c_i) * ih + h) * iw + w) * elementSize; - memcpy(outputPtr + dstIndex, inputPtr + srcIndex, elementSize); + UNI_MEMCPY(outputPtr + dstIndex, inputPtr + srcIndex, elementSize); } else { - memset(outputPtr + dstIndex, 0, elementSize); + UNI_MEMSET(outputPtr + dstIndex, 0, elementSize); } } } @@ -416,9 +444,9 @@ inline EE transformNHWCToNCHWC8( // support channel padding if (c_i < ic) { U32 srcIndex = (((n * ih + h) * iw + w) * ic + c_i) * elementSize; - memcpy(outputPtr + dstIndex, inputPtr + srcIndex, elementSize); + UNI_MEMCPY(outputPtr + dstIndex, inputPtr + srcIndex, elementSize); } else { - memset(outputPtr + dstIndex, 0, elementSize); + UNI_MEMSET(outputPtr + dstIndex, 0, elementSize); } } } @@ -435,7 +463,7 @@ inline EE transformNCHWC8ToNCHWC8ByGroup( U32 outputSize 
= tensorNumElements(outputDesc); if (group <= 1 || inputSize == outputSize) { if (input != output) { - memcpy(output, input, outputSize); + UNI_MEMCPY(output, input, outputSize); } return SUCCESS; } @@ -471,10 +499,10 @@ inline EE transformNCHWC8ToNCHWC8ByGroup( U32 srcIndex = ((((n * ict + id_a) * ih + h) * iw + w) * channelAlignSize + id_b) * elementSize; - memcpy( + UNI_MEMCPY( (U8 *)output + dstIndex, (const U8 *)input + srcIndex, elementSize); } else { - memset((U8 *)output + dstIndex, 0, elementSize); + UNI_MEMSET((U8 *)output + dstIndex, 0, elementSize); } } } @@ -485,7 +513,7 @@ inline EE transformNCHWC8ToNCHWC8ByGroup( } template -inline static void transformToNCHWC16Kernel( +inline static EE transformToNCHWC16Kernel( TensorDesc inputDesc, const T *input, TensorDesc outputDesc, T *output) { DataType idt, odt; @@ -508,7 +536,7 @@ inline static void transformToNCHWC16Kernel( } else { UNI_ERROR_LOG( "not support transform %d-dim tensor to NCHWC16 format\n", (int)inputDesc.nDims); - return; + return NOT_SUPPORTED; } if (tensorIs3d(outputDesc)) { CHECK_STATUS(tensor3dGet(outputDesc, &odt, &odf, &on, &oc, &oh)); @@ -517,10 +545,12 @@ inline static void transformToNCHWC16Kernel( CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); } else { UNI_ERROR_LOG("not support transform to %d-dim NCHWC16 tensor\n", (int)outputDesc.nDims); - return; + return NOT_SUPPORTED; } CHECK_REQUIREMENT(idt == odt); + EE ret = SUCCESS; switch (idf) { + case DF_NORMAL: case DF_MTK: case DF_NCHW: { U32 ic16 = ic / 16; @@ -593,8 +623,11 @@ inline static void transformToNCHWC16Kernel( default: { UNI_ERROR_LOG( "not support transform %s format to NCHWC16 format\n", DataFormatName()[idf]); + ret = NOT_SUPPORTED; + break; } } + return ret; } inline EE transformToNCHWC16( @@ -603,37 +636,40 @@ inline EE transformToNCHWC16( if (nullptr == input || nullptr == output) { return NULL_POINTER; } + EE ret = NOT_SUPPORTED; switch (inputDesc.dt) { #ifdef _USE_FP32 case DT_F32: { - transformToNCHWC16Kernel(inputDesc, (F32 *)input, outputDesc, (F32 *)output); + ret = transformToNCHWC16Kernel(inputDesc, (F32 *)input, outputDesc, (F32 *)output); break; } #endif #ifdef _USE_INT8 case DT_U8_Q: { - transformToNCHWC16Kernel(inputDesc, (UINT8 *)input, outputDesc, (UINT8 *)output); + ret = transformToNCHWC16Kernel( + inputDesc, (UINT8 *)input, outputDesc, (UINT8 *)output); break; } #endif default: { - return NOT_SUPPORTED; + UNI_ERROR_LOG("not support transform %s type tensor.\n", DataTypeName()[inputDesc.dt]); + break; } } - return SUCCESS; + return ret; } inline EE transformFormat( TensorDesc inputDesc, const void *input, TensorDesc outputDesc, void *output) { EE ret = NOT_SUPPORTED; - if (outputDesc.df == DF_NCHW) { + if (outputDesc.df == DF_NCHW || outputDesc.df == DF_MTK || outputDesc.df == DF_NORMAL) { ret = transformToNCHW(inputDesc, input, outputDesc, output); } else if (outputDesc.df == DF_NCHWC8) { if (inputDesc.df == DF_NORMAL) { - memcpy(output, input, tensorNumBytes(inputDesc)); + UNI_MEMCPY(output, input, tensorNumBytes(inputDesc)); ret = SUCCESS; - } else if (inputDesc.df == DF_NCHW || inputDesc.df == DF_MTK) { + } else if (inputDesc.df == DF_NCHW || inputDesc.df == DF_MTK || inputDesc.df == DF_NORMAL) { ret = transformNCHWToNCHWC8(inputDesc, input, outputDesc, output); } else if (inputDesc.df == DF_NHWC) { ret = transformNHWCToNCHWC8(inputDesc, input, outputDesc, output); @@ -648,6 +684,8 @@ inline EE transformFormat( } } else if (outputDesc.df == DF_NCHWC16) { ret = transformToNCHWC16(inputDesc, input, 
outputDesc, output); + } else if (outputDesc.df == DF_NHWC) { + ret = transformToNHWC(inputDesc, input, outputDesc, output); } else { UNI_ERROR_LOG("layout transpose can not support transform to %s format.\n", DataFormatName()[outputDesc.df]); @@ -664,34 +702,39 @@ inline EE transposeFilter( DataType idt, odt; DataFormat idf, odf; U32 in, ic, ih, iw, on, oc, oh, ow; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + if (tensorIs4d(inputDesc) && tensorIs4d(outputDesc)) { + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + } else { + UNI_ERROR_LOG("currently only support to transpose 4-dim filter.\n"); + return NOT_SUPPORTED; + } CHECK_REQUIREMENT(idf == odf); - const U8 *inputPtr = (const U8 *)input; - U8 *outputPtr = (U8 *)output; - + const U8 *src = (const U8 *)input; + U8 *dst = (U8 *)output; + EE ret = SUCCESS; switch (idf) { case DF_NHWCN8: { CHECK_REQUIREMENT(in % 8 == 0); in /= 8; U32 hwMax = ih * iw - 1; - U32 innerSize = bytesOf(idt) * ic * 8; - for (U32 o = 0; o < in; o++) { for (U32 hw = 0; hw < ih * iw; hw++) { U32 srcIndex = o * ih * iw * innerSize + hw * innerSize; U32 dstIndex = o * ih * iw * innerSize + (hwMax - hw) * innerSize; - memcpy(outputPtr + dstIndex, inputPtr + srcIndex, innerSize); + UNI_MEMCPY(dst + dstIndex, src + srcIndex, innerSize); } } break; } default: { - CHECK_STATUS(NOT_SUPPORTED); + UNI_ERROR_LOG( + "currently not support to transpose %s format filter.\n", DataFormatName()[idf]); + ret = NOT_SUPPORTED; + break; + } } - return SUCCESS; + return ret; } - #endif diff --git a/common/model_spec/include/model_common.h b/common/model_spec/include/model_common.h index 264f618f..b5b255e9 100644 --- a/common/model_spec/include/model_common.h +++ b/common/model_spec/include/model_common.h @@ -16,10 +16,33 @@ #include #include "model_spec.h" +#include "memory_cpu.h" EE str_copy(I8 *dst, const I8 *src, I32 src_len, I32 dst_len = NAME_LEN); -void *mt_new_storage(size_t size); +inline void *mt_malloc(U32 size) +{ + return UNI_OPERATOR_NEW(size); +} + +template +inline void mt_free(T *&p) +{ + UNI_OPERATOR_DELETE(p); + p = nullptr; +} + +// only WeightSpec's weight and vec variables are freed by using this, +// because they may use mmap memory. 
+template +inline void mt_free(T *&p, ModelSpec *spec) +{ + if (spec == nullptr || spec->mfd == nullptr || (uintptr_t(p) < uintptr_t(spec->mfd->bytes)) || + (uintptr_t(p) >= uintptr_t(spec->mfd->bytes + spec->mfd->fileLength))) { + UNI_OPERATOR_DELETE(p); + } + p = nullptr; +} OperatorSpec mt_create_operator( const char *name, OperatorType type, U32 num_inputs, U32 num_outputs); @@ -34,4 +57,7 @@ bool isDeprecatedOp(OperatorType opType); bool isDeprecatedOpWeight(const ModelSpec *spec, int index); std::string concat_dir_file(std::string dir, std::string file); + +void modify_ms_inputs_and_outputs( + ModelSpec *ms, std::string modifiedInputs, std::string modifiedOutputs); #endif diff --git a/common/model_spec/include/model_spec.h b/common/model_spec/include/model_spec.h index 3df6008f..121c79e4 100644 --- a/common/model_spec/include/model_spec.h +++ b/common/model_spec/include/model_spec.h @@ -16,7 +16,7 @@ #include "parameter_spec.h" -static const int sg_boltVersion = 20201120; +static const int sg_boltVersion = 20220126; static const int sg_magicNumber = 1141119; #pragma pack(8) @@ -87,14 +87,10 @@ typedef struct { } ModelSpec; #pragma pack() -#define outOfFileMapRange(addr, mfd) \ - ((mfd == nullptr) || (uintptr_t(addr) < uintptr_t(mfd->bytes)) || \ - (uintptr_t(addr) >= uintptr_t(mfd->bytes + mfd->fileLength))) - -EE mt_create_model(ModelSpec *md); +EE mt_create_model(ModelSpec *spec); EE serialize_model_to_file(const ModelSpec *spec, const char *fn); EE deserialize_model_from_file(const char *fn, ModelSpec *spec, bool useFileStream = false); -EE mt_destroy_model(ModelSpec *md); +EE mt_destroy_model(ModelSpec *spec); #include "model_print.h" #endif diff --git a/common/model_spec/src/CMakeLists.txt b/common/model_spec/src/CMakeLists.txt index e6efbc94..d610b2d6 100644 --- a/common/model_spec/src/CMakeLists.txt +++ b/common/model_spec/src/CMakeLists.txt @@ -3,6 +3,9 @@ file(GLOB srcs ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp) # shared library add_library(${PROJECT_NAME} SHARED ${srcs}) target_link_libraries(${PROJECT_NAME} LINK_PUBLIC uni) +if (USE_SECURE_C) + target_link_libraries(${PROJECT_NAME} LINK_PUBLIC ${SecureC_SHARED_LIBRARY}) +endif () # static library add_library(${PROJECT_NAME}_static STATIC ${srcs}) diff --git a/common/model_spec/src/model_common.cpp b/common/model_spec/src/model_common.cpp index 1dee4ae8..96b24691 100644 --- a/common/model_spec/src/model_common.cpp +++ b/common/model_spec/src/model_common.cpp @@ -17,7 +17,7 @@ OperatorSpec mt_create_operator(const char *name, OperatorType type, U32 num_inputs, U32 num_outputs) { OperatorSpec newOperator; - memset(&(newOperator), 0, sizeof(OperatorSpec)); + UNI_MEMSET(&(newOperator), 0, sizeof(OperatorSpec)); U32 length = UNI_MIN(strlen(name), NAME_LEN - 1); str_copy(newOperator.name, name, length); if (length < NAME_LEN) { @@ -25,14 +25,14 @@ OperatorSpec mt_create_operator(const char *name, OperatorType type, U32 num_inp } newOperator.type = type; newOperator.num_inputs = num_inputs; - newOperator.input_tensors_name = (I8 **)mt_new_storage(num_inputs * sizeof(I8 *)); + newOperator.input_tensors_name = (I8 **)mt_malloc(num_inputs * sizeof(I8 *)); for (U32 i = 0; i < num_inputs; i++) { - newOperator.input_tensors_name[i] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); + newOperator.input_tensors_name[i] = (I8 *)mt_malloc(NAME_LEN * sizeof(I8)); } newOperator.num_outputs = num_outputs; - newOperator.output_tensors_name = (I8 **)mt_new_storage(num_outputs * sizeof(I8 *)); + newOperator.output_tensors_name = (I8 **)mt_malloc(num_outputs 
* sizeof(I8 *)); for (U32 i = 0; i < num_outputs; i++) { - newOperator.output_tensors_name[i] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); + newOperator.output_tensors_name[i] = (I8 *)mt_malloc(NAME_LEN * sizeof(I8)); } newOperator.tensor_positions = NULL; newOperator.num_quant_feature = 0; @@ -46,7 +46,7 @@ EE mt_insert_operator(ModelSpec *ms, int index, OperatorSpec newOperator) return NULL_POINTER; } OperatorSpec *operatorList = - (OperatorSpec *)mt_new_storage(sizeof(OperatorSpec) * (ms->num_operator_specs + 1)); + (OperatorSpec *)mt_malloc(sizeof(OperatorSpec) * (ms->num_operator_specs + 1)); for (int i = 0; i < index; i++) { operatorList[i] = ms->ops[i]; } @@ -54,7 +54,7 @@ EE mt_insert_operator(ModelSpec *ms, int index, OperatorSpec newOperator) for (int i = index; i < ms->num_operator_specs; i++) { operatorList[i + 1] = ms->ops[i]; } - delete ms->ops; + mt_free(ms->ops); ms->ops = operatorList; ms->num_operator_specs++; return SUCCESS; @@ -64,7 +64,7 @@ WeightSpec mt_create_weight( const char *name, DataType dataType, U32 bytesOfWeight, U32 bytesOfVec, U32 numQuantScale) { WeightSpec newWeight; - memset(&(newWeight), 0, sizeof(WeightSpec)); + UNI_MEMSET(&(newWeight), 0, sizeof(WeightSpec)); U32 length = UNI_MIN(strlen(name), NAME_LEN - 1); str_copy(newWeight.op_name, name, length); if (length < NAME_LEN) { @@ -72,11 +72,11 @@ WeightSpec mt_create_weight( } newWeight.mdt = dataType; newWeight.bytes_of_weight = bytesOfWeight; - newWeight.weight = (U8 *)mt_new_storage(bytesOfWeight); + newWeight.weight = (U8 *)mt_malloc(bytesOfWeight); newWeight.bytes_of_vec = bytesOfVec; - newWeight.vec = (U8 *)mt_new_storage(bytesOfVec); + newWeight.vec = (U8 *)mt_malloc(bytesOfVec); newWeight.num_quant_scale = numQuantScale; - newWeight.weight_scale = (QuantSpec *)mt_new_storage(sizeof(QuantSpec) * numQuantScale); + newWeight.weight_scale = (QuantSpec *)mt_malloc(sizeof(QuantSpec) * numQuantScale); return newWeight; } @@ -100,31 +100,18 @@ bool isDeprecatedOpWeight(const ModelSpec *spec, int index) EE str_copy(I8 *dst, const I8 *src, I32 srcLen, I32 dstLen) { - //memset(dst, 0, dstLen); + //UNI_MEMSET(dst, 0, dstLen); //I32 copyLen = UNI_MIN(srcLen, dstLen); - //memcpy(dst, src, copyLen); - memset(dst, 0, dstLen); + //UNI_MEMCPY(dst, src, copyLen); + UNI_MEMSET(dst, 0, dstLen); I32 copyLen = NAME_LEN - 1; if (copyLen > srcLen) { copyLen = srcLen; } - memcpy(dst, src, copyLen * sizeof(I8)); + UNI_MEMCPY(dst, src, copyLen * sizeof(I8)); return SUCCESS; } -void *mt_new_storage(size_t size) -{ - void *ret = nullptr; - if (size > 0) { - try { - ret = operator new(size); - } catch (const std::bad_alloc &e) { - UNI_ERROR_LOG("%s alloc %d bytes failed\n", __FUNCTION__, (int)size); - } - } - return ret; -} - std::string concat_dir_file(std::string dir, std::string file) { std::string ret; @@ -143,3 +130,66 @@ std::string concat_dir_file(std::string dir, std::string file) return ret; } + +std::vector string_parser(std::string s, std::string delimiter) +{ + std::vector res; + size_t pos = 0; + std::string token; + while ((pos = s.find(delimiter)) != std::string::npos) { + token = s.substr(0, pos); + res.push_back(token); + s.erase(0, pos + delimiter.length()); + } + res.push_back(s); + return res; +} + +void modify_ms_inputs_and_outputs( + ModelSpec *ms, std::string modifiedInputs, std::string modifiedOutputs) +{ + std::map modifiedStrMap; + if (modifiedInputs.length() > 0) { + std::vector modified_input_names = string_parser(modifiedInputs, ","); + if ((I32)(modified_input_names.size()) != ms->num_inputs) 
{ + UNI_ERROR_LOG("input names not match, please check your params meticulously.\n"); + } + for (int i = 0; i < ms->num_inputs; i++) { + std::string tmpStr = modified_input_names[i]; + modifiedStrMap[std::string(ms->input_names[i])] = tmpStr; + str_copy(ms->input_names[i], tmpStr.c_str(), tmpStr.length()); + } + } + if (modifiedOutputs.length() > 0) { + std::vector modified_output_names = string_parser(modifiedOutputs, ","); + if ((I32)(modified_output_names.size()) != ms->num_outputs) { + UNI_ERROR_LOG("output names not match, please check your params meticulously.\n"); + } + for (int i = 0; i < ms->num_outputs; i++) { + std::string tmpStr = modified_output_names[i]; + modifiedStrMap[std::string(ms->output_names[i])] = tmpStr; + str_copy(ms->output_names[i], tmpStr.c_str(), tmpStr.length()); + } + } + + if (modifiedStrMap.size() > 0) { + for (I32 i = 0; i < ms->num_operator_specs; i++) { + for (U32 j = 0; j < ms->ops[i].num_inputs; j++) { + std::string curStr = std::string(ms->ops[i].input_tensors_name[j]); + if (modifiedStrMap.find(curStr) != modifiedStrMap.end()) { + std::string modifiedStr = modifiedStrMap[curStr]; + str_copy(ms->ops[i].input_tensors_name[j], modifiedStr.c_str(), + modifiedStr.length()); + } + } + for (U32 j = 0; j < ms->ops[i].num_outputs; j++) { + std::string curStr = std::string(ms->ops[i].output_tensors_name[j]); + if (modifiedStrMap.find(curStr) != modifiedStrMap.end()) { + std::string modifiedStr = modifiedStrMap[curStr]; + str_copy(ms->ops[i].output_tensors_name[j], modifiedStr.c_str(), + modifiedStr.length()); + } + } + } + } +} diff --git a/common/model_spec/src/model_deserialize.cpp b/common/model_spec/src/model_deserialize.cpp index 13f8bcbb..8388c5a9 100644 --- a/common/model_spec/src/model_deserialize.cpp +++ b/common/model_spec/src/model_deserialize.cpp @@ -128,16 +128,16 @@ EE operator_relationship(ModelSpec *spec) int opNum = spec->num_operator_specs; spec->num_op_tensor_entries = opNum; OperatorSpec *opsPtr2 = spec->ops; - OperatorRelationshipMapEntry *oprmePtr = (OperatorRelationshipMapEntry *)mt_new_storage( - sizeof(OperatorRelationshipMapEntry) * opNum); + OperatorRelationshipMapEntry *oprmePtr = + (OperatorRelationshipMapEntry *)mt_malloc(sizeof(OperatorRelationshipMapEntry) * opNum); spec->op_relationship_entries = oprmePtr; for (int j = 0; j < opNum; j++) { str_copy(oprmePtr[j].op, opsPtr2[j].name, NAME_LEN); int opInOpNum = opInTensorNew[opsPtr2[j].name].size(); oprmePtr[j].num_inputs = opInOpNum; - oprmePtr[j].input_op_names = (I8 **)mt_new_storage(opInOpNum * sizeof(I8 *)); + oprmePtr[j].input_op_names = (I8 **)mt_malloc(opInOpNum * sizeof(I8 *)); for (int k = 0; k < opInOpNum; k++) { - oprmePtr[j].input_op_names[k] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); + oprmePtr[j].input_op_names[k] = (I8 *)mt_malloc(NAME_LEN * sizeof(I8)); std::string ten_name = opInTensorNew[opsPtr2[j].name][k]; std::string tensor2op = tensorOpMapping[ten_name]; str_copy(oprmePtr[j].input_op_names[k], tensor2op.c_str(), tensor2op.length()); @@ -145,9 +145,9 @@ EE operator_relationship(ModelSpec *spec) int opOutOpNum = tensorFlowsToOpSet[opOutTensorNew[opsPtr2[j].name]].size(); oprmePtr[j].num_outputs = opOutOpNum; - oprmePtr[j].output_op_names = (I8 **)mt_new_storage(opOutOpNum * sizeof(I8 *)); + oprmePtr[j].output_op_names = (I8 **)mt_malloc(opOutOpNum * sizeof(I8 *)); for (int k = 0; k < opOutOpNum; k++) { - oprmePtr[j].output_op_names[k] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); + oprmePtr[j].output_op_names[k] = (I8 *)mt_malloc(NAME_LEN * sizeof(I8)); 
std::string tensor2op = tensorFlowsToOpSet[opOutTensorNew[opsPtr2[j].name]][k]; str_copy(oprmePtr[j].output_op_names[k], tensor2op.c_str(), tensor2op.length()); } @@ -163,11 +163,11 @@ void dequantize_int8_weight(int num, F32 scale, INT8 *q, T *d) int base = -127; for (int i = 0; i < 255; i++) { F32 value = factor * base; -#ifndef __aarch64__ +#ifndef _USE_FP16 if (dt != DT_F16) { #endif table[i] = value; -#ifndef __aarch64__ +#ifndef _USE_FP16 } else { transformFromFloat(DT_F16, &value, table + i, 1); } @@ -184,7 +184,7 @@ template inline void deserialize_field(const char **buffer, U32 *position, T *element, int length = 1) { int size = length * sizeof(T); - memcpy(element, *buffer, size); + UNI_MEMCPY(element, *buffer, size); *buffer += size; *position += size; } @@ -196,18 +196,20 @@ EE deserialize_header(const char *bytes, ModelSpec *spec, U32 *pos) deserialize_field(pointer, pos, &spec->version); if (spec->version != sg_boltVersion) { - UNI_ERROR_LOG("X2bolt version is [%d], but your model version is : [%d].\n Please update " - "X2bolt to version[%d].\n", - sg_boltVersion, spec->version, spec->version); - CHECK_STATUS(NOT_MATCH); + UNI_WARNING_LOG("The library's model reader version (%d) should match the model " + "file version, but your model version is %d. This may " + "cause errors.\nPlease use another library version or reconvert the model.\n", + sg_boltVersion, spec->version); + } + if (spec->version < 20201120) { + UNI_ERROR_LOG("This library can not read a model with version (%d).\n", spec->version); return NOT_MATCH; } deserialize_field(pointer, pos, &spec->magic_number); if (spec->magic_number != sg_magicNumber) { - UNI_ERROR_LOG( - "magic_number not_match: code %d bolt model %d\n", sg_magicNumber, spec->magic_number); - CHECK_STATUS(NOT_MATCH); + UNI_ERROR_LOG("magic number does not match: library is %d, bolt model is %d\n", sg_magicNumber, + spec->magic_number); return NOT_MATCH; } @@ -215,18 +217,18 @@ EE deserialize_header(const char *bytes, ModelSpec *spec, U32 *pos) deserialize_field(pointer, pos, &spec->dt); deserialize_field(pointer, pos, &spec->num_inputs); - spec->input_names = (I8 **)mt_new_storage(spec->num_inputs * sizeof(I8 *)); - spec->input_dims = (TensorDesc *)mt_new_storage(spec->num_inputs * sizeof(TensorDesc)); + spec->input_names = (I8 **)mt_malloc(spec->num_inputs * sizeof(I8 *)); + spec->input_dims = (TensorDesc *)mt_malloc(spec->num_inputs * sizeof(TensorDesc)); for (int i = 0; i < spec->num_inputs; i++) { - spec->input_names[i] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); + spec->input_names[i] = (I8 *)mt_malloc(NAME_LEN * sizeof(I8)); deserialize_field(pointer, pos, spec->input_names[i], NAME_LEN); } deserialize_field(pointer, pos, spec->input_dims, spec->num_inputs); deserialize_field(pointer, pos, &spec->num_outputs); - spec->output_names = (I8 **)mt_new_storage(spec->num_outputs * NAME_LEN); + spec->output_names = (I8 **)mt_malloc(spec->num_outputs * NAME_LEN); for (int i = 0; i < spec->num_outputs; i++) { - spec->output_names[i] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); + spec->output_names[i] = (I8 *)mt_malloc(NAME_LEN * sizeof(I8)); deserialize_field(pointer, pos, spec->output_names[i], NAME_LEN); } return SUCCESS; @@ -238,43 +240,57 @@ EE deserialize_operator(const char *bytes, ModelSpec *spec, U32 *pos) const char **pointer = &operator_pointer; deserialize_field(pointer, pos, &spec->num_operator_specs); - spec->ops = (OperatorSpec *)mt_new_storage(spec->num_operator_specs * sizeof(OperatorSpec)); + spec->ops = (OperatorSpec 
*)mt_malloc(spec->num_operator_specs * sizeof(OperatorSpec)); OperatorSpec *ptr = spec->ops; for (int i = 0; i < spec->num_operator_specs; i++) { deserialize_field(pointer, pos, ptr[i].name, NAME_LEN); deserialize_field(pointer, pos, &ptr[i].type); deserialize_field(pointer, pos, &ptr[i].num_inputs); - ptr[i].input_tensors_name = (I8 **)mt_new_storage(ptr[i].num_inputs * sizeof(I8 *)); + ptr[i].input_tensors_name = (I8 **)mt_malloc(ptr[i].num_inputs * sizeof(I8 *)); for (U32 j = 0; j < ptr[i].num_inputs; j++) { - ptr[i].input_tensors_name[j] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); + ptr[i].input_tensors_name[j] = (I8 *)mt_malloc(NAME_LEN * sizeof(I8)); deserialize_field(pointer, pos, ptr[i].input_tensors_name[j], NAME_LEN); } deserialize_field(pointer, pos, &ptr[i].num_outputs); - ptr[i].output_tensors_name = (I8 **)mt_new_storage(ptr[i].num_outputs * sizeof(I8 *)); + ptr[i].output_tensors_name = (I8 **)mt_malloc(ptr[i].num_outputs * sizeof(I8 *)); for (U32 j = 0; j < ptr[i].num_outputs; j++) { - ptr[i].output_tensors_name[j] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); + ptr[i].output_tensors_name[j] = (I8 *)mt_malloc(NAME_LEN * sizeof(I8)); deserialize_field(pointer, pos, ptr[i].output_tensors_name[j], NAME_LEN); } U32 numTensors = ptr[i].num_inputs + ptr[i].num_outputs; - ptr[i].tensor_positions = (I32 *)mt_new_storage(numTensors * sizeof(I32)); + ptr[i].tensor_positions = (I32 *)mt_malloc(numTensors * sizeof(I32)); deserialize_field(pointer, pos, ptr[i].tensor_positions, numTensors); deserialize_field(pointer, pos, &ptr[i].num_quant_feature); - ptr[i].feature_scale = - (QuantSpec *)mt_new_storage(ptr[i].num_quant_feature * sizeof(QuantSpec)); + ptr[i].feature_scale = (QuantSpec *)mt_malloc(ptr[i].num_quant_feature * sizeof(QuantSpec)); for (U32 j = 0; j < ptr[i].num_quant_feature; j++) { deserialize_field(pointer, pos, &(ptr[i].feature_scale[j].num_scale)); ptr[i].feature_scale[j].scale = - (F32 *)mt_new_storage(ptr[i].feature_scale[j].num_scale * sizeof(F32)); + (F32 *)mt_malloc(ptr[i].feature_scale[j].num_scale * sizeof(F32)); deserialize_field( pointer, pos, ptr[i].feature_scale[j].scale, ptr[i].feature_scale[j].num_scale); } - deserialize_field( - pointer, pos, (U8 *)&(ptr[i].ps), get_operator_parameter_size(ptr[i].type)); + deserialize_field(pointer, pos, (U8 *)&(ptr[i].ps), + get_operator_parameter_size(spec->version, ptr[i].type)); + if (spec->version == 20201120) { + if (ptr[i].type == OT_Conv || ptr[i].type == OT_Deconvolution) { + ptr[i].ps.conv_spec.output_pad_t = 0; + ptr[i].ps.conv_spec.output_pad_h = 0; + ptr[i].ps.conv_spec.output_pad_w = 0; + } + if (ptr[i].type == OT_LayerNorm) { + ptr[i].ps.ln_spec.axis = -1; + } + } + if (spec->version == 20201120 || spec->version == 20211021) { + if (ptr[i].type == OT_Transpose) { + ptr[i].ps.transpose_spec.df = DF_NCHW; + } + } } return SUCCESS; } @@ -285,7 +301,7 @@ EE deserialize_weight(const char *bytes, ModelSpec *spec, U32 *pos) const char **pointer = &weight_pointer; deserialize_field(pointer, pos, &spec->num_weight_specs); - spec->ws = (WeightSpec *)mt_new_storage(spec->num_weight_specs * sizeof(WeightSpec)); + spec->ws = (WeightSpec *)mt_malloc(spec->num_weight_specs * sizeof(WeightSpec)); WeightSpec *ptr = spec->ws; for (int i = 0; i < spec->num_weight_specs; i++) { U32 length = 0, count = 0; @@ -296,17 +312,19 @@ EE deserialize_weight(const char *bytes, ModelSpec *spec, U32 *pos) bool quantFP16 = false; bool quantInt8 = false; - if (DT_F16 == ptr[i].mdt && DT_F32 == spec->dt) { - ptr[i].mdt = DT_F32; - quantFP16 
= true; - } else if (DT_I8 == ptr[i].mdt && DT_I8 != spec->dt) { - if (spec->dt == DT_F16_8Q) { - ptr[i].mdt = DT_F16; - } else if (spec->dt == DT_F32_8Q) { - ptr[i].mdt = DT_F32; - } else { - ptr[i].mdt = spec->dt; + if (DT_F32 == spec->dt) { + if (ptr[i].mdt == DT_F16) { + quantFP16 = true; + } + if (ptr[i].mdt == DT_I8) { + quantInt8 = true; } + ptr[i].mdt = DT_F32; + } else if (DT_F16_8Q == ptr[i].mdt) { + ptr[i].mdt = DT_F16; + quantInt8 = true; + } else if (DT_F32_8Q == ptr[i].mdt) { + ptr[i].mdt = DT_F32; quantInt8 = true; } @@ -338,12 +356,11 @@ EE deserialize_weight(const char *bytes, ModelSpec *spec, U32 *pos) } deserialize_field(pointer, pos, &ptr[i].num_quant_scale); - ptr[i].weight_scale = - (QuantSpec *)mt_new_storage(ptr[i].num_quant_scale * sizeof(QuantSpec)); + ptr[i].weight_scale = (QuantSpec *)mt_malloc(ptr[i].num_quant_scale * sizeof(QuantSpec)); for (U32 j = 0; j < ptr[i].num_quant_scale; j++) { deserialize_field(pointer, pos, &(ptr[i].weight_scale[j].num_scale)); ptr[i].weight_scale[j].scale = - (F32 *)mt_new_storage(ptr[i].weight_scale[j].num_scale * sizeof(F32)); + (F32 *)mt_malloc(ptr[i].weight_scale[j].num_scale * sizeof(F32)); deserialize_field( pointer, pos, ptr[i].weight_scale[j].scale, ptr[i].weight_scale[j].num_scale); } @@ -351,21 +368,21 @@ EE deserialize_weight(const char *bytes, ModelSpec *spec, U32 *pos) CHECK_REQUIREMENT(length == count); if (quantFP16) { - ptr[i].weight = (U8 *)mt_new_storage(ptr[i].bytes_of_weight); - ptr[i].vec = (U8 *)mt_new_storage(ptr[i].bytes_of_vec); + ptr[i].weight = (U8 *)mt_malloc(ptr[i].bytes_of_weight); + ptr[i].vec = (U8 *)mt_malloc(ptr[i].bytes_of_vec); transformToFloat(DT_F16, serialWeight, (F32 *)ptr[i].weight, ptr[i].bytes_of_weight / 4); transformToFloat(DT_F16, serialBias, (F32 *)ptr[i].vec, ptr[i].bytes_of_vec / 4); } else { if (quantInt8) { CHECK_REQUIREMENT( 1 == ptr[i].num_quant_scale && 1 == ptr[i].weight_scale[0].num_scale); - ptr[i].weight = (U8 *)mt_new_storage(ptr[i].bytes_of_weight); + ptr[i].weight = (U8 *)mt_malloc(ptr[i].bytes_of_weight); F32 scale = ptr[i].weight_scale[0].scale[0]; if (DT_F32 == ptr[i].mdt) { dequantize_int8_weight(ptr[i].bytes_of_weight / 4, scale, (INT8 *)serialWeight, (F32 *)ptr[i].weight); } else if (DT_F16 == ptr[i].mdt) { -#ifdef __aarch64__ +#ifdef _USE_FP16 dequantize_int8_weight(ptr[i].bytes_of_weight / 2, scale, (INT8 *)serialWeight, (F16 *)ptr[i].weight); #else @@ -375,7 +392,7 @@ EE deserialize_weight(const char *bytes, ModelSpec *spec, U32 *pos) } else { UNI_ERROR_LOG( "Can not support convert INT8 data to %s.\n", DataTypeName()[ptr[i].mdt]); - exit(1); + return NOT_SUPPORTED; } } else { ptr[i].weight = serialWeight; @@ -389,28 +406,36 @@ EE deserialize_weight(const char *bytes, ModelSpec *spec, U32 *pos) EE deserialize_model(const char *bytes, ModelSpec *spec) { U32 pos = 0; - CHECK_STATUS(deserialize_header(bytes, spec, &pos)); - CHECK_STATUS(deserialize_operator(bytes, spec, &pos)); - CHECK_STATUS(deserialize_weight(bytes, spec, &pos)); - CHECK_STATUS(operator_relationship(spec)); + EE ret = deserialize_header(bytes, spec, &pos); + if (ret == SUCCESS) { + ret = deserialize_operator(bytes, spec, &pos); + } + if (ret == SUCCESS) { + ret = deserialize_weight(bytes, spec, &pos); + } + if (ret == SUCCESS) { + ret = operator_relationship(spec); + } if (spec->mfd->useFileStream) { spec->mfd->fileLength = pos; } - return SUCCESS; + return ret; } EE deserialize_model_from_file(const char *fn, ModelSpec *spec, bool useFileStream) { UNI_DEBUG_LOG("Read bolt model from %s...\n", 
(useFileStream ? "file stream" : fn)); + EE ret = NOT_SUPPORTED; UNI_PROFILE( { char *bytes = nullptr; int fd = -1; size_t fileLength; - spec->mfd = (ModelFileDescriptor *)mt_new_storage(sizeof(ModelFileDescriptor)); + spec->mfd = (ModelFileDescriptor *)mt_malloc(sizeof(ModelFileDescriptor)); spec->mfd->useFileStream = useFileStream; if (useFileStream) { bytes = (char *)fn; + ret = SUCCESS; } else { #ifdef _WIN32 FILE *file = fopen(fn, "rb"); @@ -423,7 +448,7 @@ EE deserialize_model_from_file(const char *fn, ModelSpec *spec, bool useFileStre fileLength = ftell(file); rewind(file); - bytes = (char *)malloc(sizeof(char) * fileLength); + bytes = (char *)UNI_MALLOC(sizeof(char) * fileLength); if (bytes == NULL) { UNI_ERROR_LOG("Memory allocated for model failed.\n"); } @@ -459,9 +484,9 @@ EE deserialize_model_from_file(const char *fn, ModelSpec *spec, bool useFileStre } spec->mfd->bytes = bytes; - CHECK_STATUS(deserialize_model(bytes, spec)); + ret = deserialize_model(bytes, spec); }, std::string("deserialize_model_from_file"), std::string("prepare")); UNI_DEBUG_LOG("Read bolt model end.\n"); - return SUCCESS; + return ret; } diff --git a/common/model_spec/src/model_print.cpp b/common/model_spec/src/model_print.cpp index f526b79e..5a3f5654 100644 --- a/common/model_spec/src/model_print.cpp +++ b/common/model_spec/src/model_print.cpp @@ -16,6 +16,7 @@ void print_header(const ModelSpec ms) { +#ifdef _USE_MODEL_PRINT printf("[Model] %s\n [DataType] %s\n [Inputs] %d\n", ms.model_name, DataTypeName()[ms.dt], ms.num_inputs); if (ms.num_inputs > 0) { @@ -32,10 +33,12 @@ void print_header(const ModelSpec ms) for (int i = 0; i < ms.num_outputs; i++) { printf(" %2d %s\n", i, ms.output_names[i]); } +#endif } void print_operator_tensor_relationship(const ModelSpec ms, bool deleteDeprecatedOp) { +#ifdef _USE_MODEL_PRINT int number = ms.num_operator_specs; printf(" [Operators] %d\n", number); if (number > 0) { @@ -72,10 +75,12 @@ void print_operator_tensor_relationship(const ModelSpec ms, bool deleteDeprecate } printf("\n"); } +#endif } void print_weights(const ModelSpec ms) { +#ifdef _USE_MODEL_PRINT std::map vec_data_type; for (int i = 0; i < ms.num_operator_specs; i++) { switch (ms.ops[i].type) { @@ -129,10 +134,12 @@ void print_weights(const ModelSpec ms) } printf("\n"); } +#endif } void print_relationship(const ModelSpec ms) { +#ifdef _USE_MODEL_PRINT int number = ms.num_op_tensor_entries; printf(" [Relationships] %d\n", number); if (number > 0) { @@ -149,6 +156,7 @@ void print_relationship(const ModelSpec ms) } printf("\n"); } +#endif } void print_ms(const ModelSpec ms) diff --git a/common/model_spec/src/model_serialize.cpp b/common/model_spec/src/model_serialize.cpp index 617bc183..136a34ce 100644 --- a/common/model_spec/src/model_serialize.cpp +++ b/common/model_spec/src/model_serialize.cpp @@ -18,14 +18,14 @@ EE serialize_header(const ModelSpec *spec, std::string *tmp) U32 bufSize = sizeof(I32) * 2 + sizeof(I8) * NAME_LEN + sizeof(DataType) + sizeof(I32) + sizeof(I8) * NAME_LEN * spec->num_inputs + sizeof(TensorDesc) * spec->num_inputs + sizeof(I32) + sizeof(I8) * NAME_LEN * spec->num_outputs; - I8 *data = (I8 *)mt_new_storage(bufSize); + I8 *data = (I8 *)mt_malloc(bufSize); I32 *pointer4version = (I32 *)data; - memcpy(pointer4version, &spec->version, sizeof(I32)); + UNI_MEMCPY(pointer4version, &spec->version, sizeof(I32)); pointer4version += 1; I32 *pointer4magicNumber = (I32 *)pointer4version; - memcpy(pointer4magicNumber, &spec->magic_number, sizeof(I32)); + UNI_MEMCPY(pointer4magicNumber, 
&spec->magic_number, sizeof(I32)); pointer4magicNumber += 1; I8 *pointer4modelName = (I8 *)pointer4magicNumber; @@ -47,7 +47,7 @@ EE serialize_header(const ModelSpec *spec, std::string *tmp) } TensorDesc *pointer4TensorDesc = (TensorDesc *)pointer4InputNames; - memcpy(pointer4TensorDesc, spec->input_dims, sizeof(TensorDesc) * spec->num_inputs); + UNI_MEMCPY(pointer4TensorDesc, spec->input_dims, sizeof(TensorDesc) * spec->num_inputs); pointer4TensorDesc += spec->num_inputs; I32 *pointer4numOutputs = (I32 *)pointer4TensorDesc; @@ -63,7 +63,7 @@ EE serialize_header(const ModelSpec *spec, std::string *tmp) tmp->clear(); CHECK_REQUIREMENT((U32)(pointer4outputNames - data) == bufSize); tmp->assign(data, data + bufSize); - delete data; + mt_free(data); return SUCCESS; } @@ -72,7 +72,8 @@ U32 operator_memory_size(OperatorSpec *ops) // sizeof(U32) * 4 : type + num_inputs + num_output + num_quant_feature U32 allocatedBufferSize = sizeof(I8) * NAME_LEN + sizeof(U32) * 4 + ops->num_inputs * NAME_LEN * sizeof(I8) + ops->num_outputs * NAME_LEN * sizeof(I8) + - (ops->num_inputs + ops->num_outputs) * sizeof(I32) + get_operator_parameter_size(ops->type); + (ops->num_inputs + ops->num_outputs) * sizeof(I32) + + get_operator_parameter_size(sg_boltVersion, ops->type); for (U32 i = 0; i < ops->num_quant_feature; i++) { allocatedBufferSize += sizeof(int); // num_scale @@ -95,7 +96,7 @@ EE serialize_operators(const ModelSpec *spec, std::string *tmp) opsTmp++; } - char *data = (char *)mt_new_storage(bufSize); + char *data = (char *)mt_malloc(bufSize); I32 *pointer4numOperatorSpecs = (I32 *)data; *pointer4numOperatorSpecs = spec->num_operator_specs - removeOpNum; // attention @@ -139,7 +140,7 @@ EE serialize_operators(const ModelSpec *spec, std::string *tmp) I32 *pointer4tensorPos = (I32 *)pointer4opsOutputTensorsName; U32 numTensors = opsPointer[i].num_inputs + opsPointer[i].num_outputs; if (nullptr != opsPointer[i].tensor_positions) { - memcpy(pointer4tensorPos, opsPointer[i].tensor_positions, numTensors * sizeof(I32)); + UNI_MEMCPY(pointer4tensorPos, opsPointer[i].tensor_positions, numTensors * sizeof(I32)); } else { for (U32 j = 0; j < numTensors; j++) { pointer4tensorPos[j] = -1; @@ -156,13 +157,13 @@ EE serialize_operators(const ModelSpec *spec, std::string *tmp) *pointer4quant = opsPointer[i].feature_scale[j].num_scale; int num = *pointer4quant; pointer4quant++; - memcpy(pointer4quant, opsPointer[i].feature_scale[j].scale, num * sizeof(F32)); + UNI_MEMCPY(pointer4quant, opsPointer[i].feature_scale[j].scale, num * sizeof(F32)); pointer4quant += num; } char *pointer4parameterSpecs = (char *)pointer4quant; - int operatorParameterSize = get_operator_parameter_size(opsPointer[i].type); - memcpy(pointer4parameterSpecs, &(opsPointer[i].ps), operatorParameterSize); + int operatorParameterSize = get_operator_parameter_size(sg_boltVersion, opsPointer[i].type); + UNI_MEMCPY(pointer4parameterSpecs, &(opsPointer[i].ps), operatorParameterSize); pointer4parameterSpecs += operatorParameterSize; pointer4opsName = (I8 *)pointer4parameterSpecs; } @@ -170,7 +171,7 @@ EE serialize_operators(const ModelSpec *spec, std::string *tmp) tmp->clear(); CHECK_REQUIREMENT((U32)(pointer4opsName - data) == bufSize); tmp->assign(data, data + bufSize); - delete data; + mt_free(data); return SUCCESS; } @@ -194,7 +195,7 @@ EE serialize_weights(const ModelSpec *spec, std::string *tmp) weightCount++; } - char *data = (char *)mt_new_storage(bufSize); + char *data = (char *)mt_malloc(bufSize); I32 *pointer4numWeightSpecs = (I32 *)data; 
*pointer4numWeightSpecs = weightCount; @@ -225,7 +226,7 @@ EE serialize_weights(const ModelSpec *spec, std::string *tmp) pointer4wsBytesOfWeight++; U8 *pointer4wsWeight = (U8 *)pointer4wsBytesOfWeight; - memcpy(pointer4wsWeight, wsPointer[i].weight, wsPointer[i].bytes_of_weight); + UNI_MEMCPY(pointer4wsWeight, wsPointer[i].weight, wsPointer[i].bytes_of_weight); pointer4wsWeight += wsPointer[i].bytes_of_weight; U32 *pointer4wsBytesOfVec = (U32 *)pointer4wsWeight; @@ -233,7 +234,7 @@ EE serialize_weights(const ModelSpec *spec, std::string *tmp) pointer4wsBytesOfVec++; U8 *pointer4wsVec = (U8 *)pointer4wsBytesOfVec; - memcpy(pointer4wsVec, wsPointer[i].vec, wsPointer[i].bytes_of_vec); + UNI_MEMCPY(pointer4wsVec, wsPointer[i].vec, wsPointer[i].bytes_of_vec); pointer4wsVec += wsPointer[i].bytes_of_vec; U32 *pointer4numquant = (U32 *)pointer4wsVec; @@ -245,7 +246,7 @@ EE serialize_weights(const ModelSpec *spec, std::string *tmp) *pointer4quant = wsPointer[i].weight_scale[j].num_scale; int num = *pointer4quant; pointer4quant++; - memcpy(pointer4quant, wsPointer[i].weight_scale[j].scale, num * sizeof(F32)); + UNI_MEMCPY(pointer4quant, wsPointer[i].weight_scale[j].scale, num * sizeof(F32)); pointer4quant += num; } @@ -255,7 +256,7 @@ EE serialize_weights(const ModelSpec *spec, std::string *tmp) tmp->clear(); CHECK_REQUIREMENT((U32)(pointer4wsOpName - data) == bufSize); tmp->assign(data, data + bufSize); - delete data; + mt_free(data); return SUCCESS; } @@ -299,8 +300,10 @@ EE serialize_model_to_file(const ModelSpec *spec, const char *fn) { UNI_DEBUG_LOG("Write bolt model to %s...\n", fn); std::string bytes = ""; - CHECK_STATUS(serialize_model(spec, &bytes)); - CHECK_STATUS(write_to_file(&bytes, fn)); + EE ret = serialize_model(spec, &bytes); + if (ret == SUCCESS) { + ret = write_to_file(&bytes, fn); + } UNI_DEBUG_LOG("Write bolt model end.\n"); return SUCCESS; } diff --git a/common/model_spec/src/model_spec.cpp b/common/model_spec/src/model_spec.cpp index 6de15409..0876089c 100644 --- a/common/model_spec/src/model_spec.cpp +++ b/common/model_spec/src/model_spec.cpp @@ -15,7 +15,7 @@ #include #endif -#include "model_spec.h" +#include "model_common.h" EE mt_create_model(ModelSpec *ms) { @@ -49,29 +49,22 @@ EE mt_destroy_model(ModelSpec *ms) if (nullptr != ms->input_names) { for (int i = 0; i < ms->num_inputs; i++) { - if (nullptr != ms->input_names[i]) { - delete ms->input_names[i]; - } - ms->input_names[i] = nullptr; + mt_free(ms->input_names[i]); } - delete ms->input_names; - ms->input_names = nullptr; + ms->num_inputs = 0; + mt_free(ms->input_names); } if (nullptr != ms->input_dims) { - delete ms->input_dims; - ms->input_dims = nullptr; + mt_free(ms->input_dims); } if (nullptr != ms->output_names) { for (int i = 0; i < ms->num_outputs; i++) { - if (nullptr != ms->output_names[i]) { - delete ms->output_names[i]; - } - ms->output_names[i] = nullptr; + mt_free(ms->output_names[i]); } - delete ms->output_names; - ms->output_names = nullptr; + ms->num_outputs = 0; + mt_free(ms->output_names); } if (nullptr != ms->ops) { @@ -79,92 +72,79 @@ EE mt_destroy_model(ModelSpec *ms) for (int i = 0; i < op_num; i++) { if (nullptr != ms->ops[i].input_tensors_name) { for (U32 j = 0; j < ms->ops[i].num_inputs; j++) { - if (nullptr != ms->ops[i].input_tensors_name[j]) { - delete ms->ops[i].input_tensors_name[j]; - } - ms->ops[i].input_tensors_name[j] = nullptr; + mt_free(ms->ops[i].input_tensors_name[j]); } - delete ms->ops[i].input_tensors_name; - ms->ops[i].input_tensors_name = nullptr; + ms->ops[i].num_inputs = 
0; + mt_free(ms->ops[i].input_tensors_name); } if (nullptr != ms->ops[i].output_tensors_name) { for (U32 j = 0; j < ms->ops[i].num_outputs; j++) { - if (nullptr != ms->ops[i].output_tensors_name[j]) { - delete ms->ops[i].output_tensors_name[j]; - } - ms->ops[i].output_tensors_name[j] = nullptr; + mt_free(ms->ops[i].output_tensors_name[j]); } - delete ms->ops[i].output_tensors_name; - ms->ops[i].output_tensors_name = nullptr; - } - - if (nullptr != ms->ops[i].tensor_positions) { - delete ms->ops[i].tensor_positions; + ms->ops[i].num_outputs = 0; + mt_free(ms->ops[i].output_tensors_name); } + mt_free(ms->ops[i].tensor_positions); if (0 != ms->ops[i].num_quant_feature && nullptr != ms->ops[i].feature_scale) { for (U32 j = 0; j < ms->ops[i].num_quant_feature; j++) { if (0 != ms->ops[i].feature_scale[j].num_scale) { - if (nullptr != ms->ops[i].feature_scale[j].scale) { - delete ms->ops[i].feature_scale[j].scale; - } + ms->ops[i].feature_scale[j].num_scale = 0; + mt_free(ms->ops[i].feature_scale[j].scale); } } - delete ms->ops[i].feature_scale; + ms->ops[i].num_quant_feature = 0; + mt_free(ms->ops[i].feature_scale); } } - delete ms->ops; - ms->ops = nullptr; + ms->num_operator_specs = 0; + mt_free(ms->ops); } if (nullptr != ms->ws) { - int weightOpNum = ms->num_weight_specs; - for (int i = 0; i < weightOpNum; i++) { - if (nullptr != ms->ws[i].weight && outOfFileMapRange(ms->ws[i].weight, ms->mfd)) { - delete ms->ws[i].weight; - } - ms->ws[i].weight = nullptr; - if (nullptr != ms->ws[i].vec && outOfFileMapRange(ms->ws[i].vec, ms->mfd)) { - delete ms->ws[i].vec; + for (int i = 0; i < ms->num_weight_specs; i++) { + ms->ws[i].bytes_of_weight = 0; + mt_free(ms->ws[i].weight, ms); + ms->ws[i].bytes_of_vec = 0; + mt_free(ms->ws[i].vec, ms); + for (U32 j = 0; j < ms->ws[i].num_quant_scale; j++) { + if (0 != ms->ws[i].weight_scale[j].num_scale) { + ms->ws[i].weight_scale[j].num_scale = 0; + mt_free(ms->ws[i].weight_scale[j].scale); + } } - ms->ws[i].vec = nullptr; + ms->ws[i].num_quant_scale = 0; + mt_free(ms->ws[i].weight_scale); } - delete ms->ws; - ms->ws = nullptr; + ms->num_weight_specs = 0; + mt_free(ms->ws); } if (nullptr != ms->op_relationship_entries) { - int numOpRelationPair = ms->num_op_tensor_entries; - for (int i = 0; i < numOpRelationPair; i++) { + for (int i = 0; i < ms->num_op_tensor_entries; i++) { if (nullptr != ms->op_relationship_entries[i].input_op_names) { for (U32 j = 0; j < ms->op_relationship_entries[i].num_inputs; j++) { - if (nullptr != ms->op_relationship_entries[i].input_op_names[j]) { - delete ms->op_relationship_entries[i].input_op_names[j]; - } - ms->op_relationship_entries[i].input_op_names[j] = nullptr; + mt_free(ms->op_relationship_entries[i].input_op_names[j]); } - delete ms->op_relationship_entries[i].input_op_names; - ms->op_relationship_entries[i].input_op_names = nullptr; + ms->op_relationship_entries[i].num_inputs = 0; + mt_free(ms->op_relationship_entries[i].input_op_names); } if (nullptr != ms->op_relationship_entries[i].output_op_names) { for (U32 j = 0; j < ms->op_relationship_entries[i].num_outputs; j++) { - if (nullptr != ms->op_relationship_entries[i].output_op_names[j]) { - delete ms->op_relationship_entries[i].output_op_names[j]; - } - ms->op_relationship_entries[i].output_op_names[j] = nullptr; + mt_free(ms->op_relationship_entries[i].output_op_names[j]); } - delete ms->op_relationship_entries[i].output_op_names; - ms->op_relationship_entries[i].output_op_names = nullptr; + ms->op_relationship_entries[i].num_outputs = 0; + 
mt_free(ms->op_relationship_entries[i].output_op_names); } } - delete ms->op_relationship_entries; - ms->op_relationship_entries = nullptr; + ms->num_op_tensor_entries = 0; + mt_free(ms->op_relationship_entries); } if (ms->mfd != nullptr && !ms->mfd->useFileStream && ms->mfd->bytes != nullptr) { #ifdef _WIN32 // use fread to read model file - free(ms->mfd->bytes); + UNI_FREE(ms->mfd->bytes); #else // use mmap to read model file munmap(ms->mfd->bytes, ms->mfd->fileLength); @@ -173,9 +153,6 @@ EE mt_destroy_model(ModelSpec *ms) } #endif } - - delete ms->mfd; - ms->mfd = nullptr; - + mt_free(ms->mfd); return SUCCESS; } diff --git a/common/uni/include/affinity_policy.h b/common/uni/include/affinity_policy.h new file mode 100644 index 00000000..b0f9b85f --- /dev/null +++ b/common/uni/include/affinity_policy.h @@ -0,0 +1,94 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_AFFINITY_POLICY +#define _H_AFFINITY_POLICY + +#include "sys.h" +#ifdef _USE_OPENMP +#include +#define OMP_MAX_NUM_THREADS \ + (getenv("OMP_NUM_THREADS") == NULL ? 
omp_get_num_procs() : atoi(getenv("OMP_NUM_THREADS"))) +#else +#define OMP_MAX_NUM_THREADS 1 +#endif +extern int OMP_NUM_THREADS; +const int CPU_MAX_NUMBER = 128; + +typedef enum { + AFFINITY_CPU = 0, + AFFINITY_CPU_LOW_POWER = 1, + AFFINITY_CPU_HIGH_PERFORMANCE = 2, + AFFINITY_GPU = 3 +} AffinityPolicy; + +typedef struct CpuStat { + unsigned long idle; + unsigned long total; +} CpuStat; + +typedef struct DeviceInfo { + int cpuNum; + Arch archs[CPU_MAX_NUMBER]; + long freqs[CPU_MAX_NUMBER]; + float occupys[CPU_MAX_NUMBER]; + int cpuids[CPU_MAX_NUMBER]; + CpuStat cpuStats[CPU_MAX_NUMBER]; + + float maxOccupy; + AffinityPolicy affinityPolicy; + Arch schedule; +} DeviceInfo; + +inline const char *const *AffinityPolicyNames() +{ + static const char *const names[] = { + "CPU_AFFINITY", "CPU_AFFINITY_LOW_POWER", "CPU_AFFINITY_HIGH_PERFORMANCE", "GPU"}; + return names; +} + +inline const AffinityPolicy *AffinityPolicies() +{ + static const AffinityPolicy policies[] = { + AFFINITY_CPU, AFFINITY_CPU_LOW_POWER, AFFINITY_CPU_HIGH_PERFORMANCE, AFFINITY_GPU}; + return policies; +} + +inline AffinityPolicy thread_affinity_get_policy_by_name(const char *name) +{ + for (int i = 0; i < 4; i++) { + const char *target = AffinityPolicyNames()[i]; + if (strcmp(target, name) == 0) { + return AffinityPolicies()[i]; + } + } + return AFFINITY_CPU_HIGH_PERFORMANCE; +} + +inline void set_cpu_num_threads(int threadNum) +{ +#ifndef _USE_OPENMP + if (threadNum > 1) { + UNI_WARNING_LOG("this library not support multi-threads parallel, please rebuild with " + "--openmp option.\n"); + } +#endif + if (threadNum < 0) { + threadNum = 1; + } + if (threadNum > OMP_MAX_NUM_THREADS) { + threadNum = OMP_MAX_NUM_THREADS; + } + OMP_NUM_THREADS = threadNum; +} +#endif diff --git a/common/uni/include/algorithm_map.h b/common/uni/include/algorithm_map.h index 22c315e9..5adecb42 100644 --- a/common/uni/include/algorithm_map.h +++ b/common/uni/include/algorithm_map.h @@ -58,9 +58,7 @@ class AlgorithmMap { if (i == 96) { continue; } - char j[8]; - sprintf(j, "%c", i); - charSet.insert(j); + charSet.insert(std::string(1, i)); } std::string name = modelName; diff --git a/common/uni/include/arm_neon_expand.h b/common/uni/include/arm_neon_expand.h index 83580162..0ff739c1 100644 --- a/common/uni/include/arm_neon_expand.h +++ b/common/uni/include/arm_neon_expand.h @@ -344,7 +344,7 @@ inline void vst1q_lane_f16_builtin(__fp16 *address, float16x8_t vec, const int l #endif #ifdef _USE_INT8 -#ifdef __aarch64__ +#ifdef _USE_FP16 inline int32x4_t vdotq_laneq_s32_builtin(int32x4_t c, int8x16_t a, int8x16_t b, const int laneId) { int32x4_t ret; diff --git a/common/uni/include/array_transpose.h b/common/uni/include/array_transpose.h index 52380110..579ef341 100644 --- a/common/uni/include/array_transpose.h +++ b/common/uni/include/array_transpose.h @@ -14,7 +14,8 @@ #ifndef _H_ARRAY_TRANSPOSE #define _H_ARRAY_TRANSPOSE -#include "string.h" +#include "secure_c_wrapper.h" +#include "affinity_policy.h" template static inline void inner_transpose_template(unsigned int tileSize, @@ -26,25 +27,33 @@ static inline void inner_transpose_template(unsigned int tileSize, int inputDimsNum, int outputDimsNum, unsigned int outputSize, - int sizeInnerIndex, - unsigned int *inputLocalIndex) + int sizeInnerIndex) { - for (unsigned int i = 0; i < outputSize; i++) { - unsigned int outputIndex = i; - for (int j = sizeInnerIndex; j < outputDimsNum; j++) { - unsigned int value = outputIndex % outputDims[j]; - outputIndex /= outputDims[j]; - inputLocalIndex[inputDimsNum - 1 
- transposeDims[outputDimsNum - 1 - j]] = value; - } - unsigned int inputIndex = 0; - for (int j = inputDimsNum - 1; j > sizeInnerIndex; j--) { - inputIndex = (inputIndex + inputLocalIndex[j]) * inputDims[j - 1]; - } - inputIndex += inputLocalIndex[sizeInnerIndex]; - if (branch == 0) { - *(output + i) = *(input + inputIndex); - } else { - memcpy(output + i * tileSize, input + inputIndex * tileSize, tileSize); +#ifdef _USE_OPENMP +#pragma omp parallel num_threads(OMP_NUM_THREADS) +#endif + { + std::vector inputLocalIndex(inputDimsNum); +#ifdef _USE_OPENMP +#pragma omp for +#endif + for (unsigned int i = 0; i < outputSize; i++) { + unsigned int outputIndex = i; + for (int j = sizeInnerIndex; j < outputDimsNum; j++) { + unsigned int value = outputIndex % outputDims[j]; + outputIndex /= outputDims[j]; + inputLocalIndex[inputDimsNum - 1 - transposeDims[outputDimsNum - 1 - j]] = value; + } + unsigned int inputIndex = 0; + for (int j = inputDimsNum - 1; j > sizeInnerIndex; j--) { + inputIndex = (inputIndex + inputLocalIndex[j]) * inputDims[j - 1]; + } + inputIndex += inputLocalIndex[sizeInnerIndex]; + if (branch == 0) { + *(output + i) = *(input + inputIndex); + } else { + UNI_MEMCPY(output + i * tileSize, input + inputIndex * tileSize, tileSize); + } } } } @@ -58,15 +67,6 @@ inline void array_transpose(unsigned int elementSize, int inputDimsNum, int outputDimsNum) { - unsigned int inputSize = 1, outputSize = 1; - for (int i = 0; i < inputDimsNum; i++) { - inputSize *= inputDims[i]; - } - for (int i = 0; i < outputDimsNum; i++) { - outputSize *= outputDims[i]; - } - CHECK_REQUIREMENT(inputSize == outputSize); - unsigned int sizeInner = 1; int sizeInnerIndex = 0; for (int i = outputDimsNum - 1; i >= 0; i--) { @@ -77,23 +77,55 @@ inline void array_transpose(unsigned int elementSize, break; } } + int tileSize = elementSize * sizeInner; + int in = inputDims[inputDimsNum - 1], ihiw = 0, ic = 0; + if (outputDimsNum - sizeInnerIndex == 3 && transposeDims[0] == 0 && transposeDims[1] == 2 && + transposeDims[2] == 1) { + ic = inputDims[inputDimsNum - 2]; + ihiw = inputDims[inputDimsNum - 3]; + } + if (outputDimsNum - sizeInnerIndex == 4 && transposeDims[0] == 0 && transposeDims[1] == 2 && + transposeDims[2] == 3 && transposeDims[3] == 1) { + ic = inputDims[inputDimsNum - 2]; + ihiw = inputDims[inputDimsNum - 3] * inputDims[inputDimsNum - 4]; + } + if (ic > 0 && ihiw > 0 && input != output) { +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (int o = 0; o < in * ihiw; o++) { + int n = o / ihiw; + int hw = o % ihiw; + U8 *dst = (U8 *)output + o * ic * tileSize; + for (int c = 0; c < ic; c++, dst += tileSize) { + const U8 *src = (const U8 *)input + ((n * ic + c) * ihiw + hw) * tileSize; + UNI_MEMCPY(dst, src, tileSize); + } + } + return; + } + + unsigned int inputSize = 1, outputSize = 1; + for (int i = 0; i < inputDimsNum; i++) { + inputSize *= inputDims[i]; + } + for (int i = 0; i < outputDimsNum; i++) { + outputSize *= outputDims[i]; + } + CHECK_REQUIREMENT(inputSize == outputSize); outputSize = outputSize / sizeInner; - std::vector inputLocalIndex(inputDimsNum, 0); const char *inputPtr = (const char *)input; char *outputPtr = (char *)output; if (sizeInner == 1 && elementSize == 4) { inner_transpose_template<0, int>(elementSize, inputDims, (const int *)input, outputDims, - (int *)output, transposeDims, inputDimsNum, outputDimsNum, outputSize, sizeInnerIndex, - inputLocalIndex.data()); + (int *)output, transposeDims, inputDimsNum, outputDimsNum, outputSize, 
sizeInnerIndex); } else if (sizeInner == 1 && elementSize == 2) { inner_transpose_template<0, short>(elementSize, inputDims, (const short *)input, outputDims, - (short *)output, transposeDims, inputDimsNum, outputDimsNum, outputSize, sizeInnerIndex, - inputLocalIndex.data()); + (short *)output, transposeDims, inputDimsNum, outputDimsNum, outputSize, sizeInnerIndex); } else { - inner_transpose_template<1, char>(sizeInner * elementSize, inputDims, (const char *)input, - outputDims, (char *)output, transposeDims, inputDimsNum, outputDimsNum, outputSize, - sizeInnerIndex, inputLocalIndex.data()); + inner_transpose_template<1, char>(tileSize, inputDims, (const char *)input, outputDims, + (char *)output, transposeDims, inputDimsNum, outputDimsNum, outputSize, sizeInnerIndex); } } @@ -113,22 +145,31 @@ inline void array_transpose_naive(unsigned int elementSize, inputSize *= inputDims[i]; outputSize *= outputDims[i]; } - std::vector inputLocalIndex(dimsNum); const char *inputPtr = (const char *)input; char *outputPtr = (char *)output; - for (unsigned int i = 0; i < outputSize; i++) { - unsigned int outputIndex = i; - for (int j = 0; j < dimsNum; j++) { - unsigned int value = outputIndex % outputDims[j]; - outputIndex /= outputDims[j]; - inputLocalIndex[dimsNum - 1 - transposeDims[dimsNum - 1 - j]] = value; - } - unsigned int inputIndex = 0; - for (int j = dimsNum - 1; j > 0; j--) { - inputIndex = (inputIndex + inputLocalIndex[j]) * inputDims[j - 1]; +#ifdef _USE_OPENMP +#pragma omp parallel num_threads(OMP_NUM_THREADS) +#endif + { + std::vector inputLocalIndex(dimsNum); +#ifdef _USE_OPENMP +#pragma omp for +#endif + for (unsigned int i = 0; i < outputSize; i++) { + unsigned int outputIndex = i; + for (int j = 0; j < dimsNum; j++) { + unsigned int value = outputIndex % outputDims[j]; + outputIndex /= outputDims[j]; + inputLocalIndex[dimsNum - 1 - transposeDims[dimsNum - 1 - j]] = value; + } + unsigned int inputIndex = 0; + for (int j = dimsNum - 1; j > 0; j--) { + inputIndex = (inputIndex + inputLocalIndex[j]) * inputDims[j - 1]; + } + inputIndex += inputLocalIndex[0]; + UNI_MEMCPY( + outputPtr + i * elementSize, inputPtr + inputIndex * elementSize, elementSize); } - inputIndex += inputLocalIndex[0]; - memcpy(outputPtr + i * elementSize, inputPtr + inputIndex * elementSize, elementSize); } } #endif diff --git a/common/uni/include/data_type.h b/common/uni/include/data_type.h index 58dbb121..9c152678 100644 --- a/common/uni/include/data_type.h +++ b/common/uni/include/data_type.h @@ -15,9 +15,9 @@ #define _H_DATA_TYPE #include -#include #include -#ifdef __aarch64__ +#include +#ifdef _USE_FP16 #include typedef __fp16 F16; #endif @@ -25,8 +25,9 @@ typedef __fp16 F16; #include #include #define FTZ _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); -typedef float F16; #endif +#define _USE_ULTRA_OPTIMIZATION +#include "secure_c_wrapper.h" typedef int8_t INT8; typedef uint8_t UINT8; @@ -56,26 +57,41 @@ typedef enum { DT_BIN11 = 8, DT_F32_8Q = 9, DT_U8_Q = 10, - DT_NUM = 11 + DT_I64 = 11, + DT_U64 = 12, + DT_F64 = 13, + DT_NUM = 14 } DataType; inline const char *const *DataTypeName() { static const char *const names[] = {"DT_U8", "DT_I8", "DT_U32", "DT_I32", "DT_F16", "DT_F16_8Q", - "DT_F32", "DT_BIN01", "DT_BIN11", "DT_F32_8Q", "DT_U8_Q", "DT_NUM"}; + "DT_F32", "DT_BIN01", "DT_BIN11", "DT_F32_8Q", "DT_U8_Q", "DT_I64", "DT_U64", "DT_F64", + "DT_NUM"}; return names; } inline U32 bytesOf(DataType dt) { // Please divide number of elements by 8 first in the case of binary data types - U32 bytes[] = {1, 1, 4, 4, 2, 2, 
4, 1, 1, 4, 1}; - return dt < DT_NUM ? bytes[dt] : 0; + U32 bytes[] = {1, 1, 4, 4, 2, 2, 4, 1, 1, 4, 1, 8, 8, 8}; + U32 ret; + if (dt < DT_NUM) { + ret = bytes[dt]; + } else { + ret = 0; + printf("[ERROR] try to get unknown type:%s bytes.\n", DataTypeName()[dt]); + exit(1); + } + return ret; } #ifdef _USE_FP16 inline void transformFromHalf(DataType dataType, const F16 *src, void *dst, int num) { + if (num <= 0) { + return; + } if (num % 8 != 0) { printf("[ERROR] can not support to transformFromHalf for array(length(%d) mod 8 != 0).\n", num); @@ -110,6 +126,9 @@ inline void transformFromHalf(DataType dataType, const F16 *src, void *dst, int inline void transformToHalf(DataType dataType, const void *src, F16 *dst, int num) { + if (num <= 0) { + return; + } if (num % 8 != 0) { printf( "[ERROR] can not support to transformToHalf for array(length(%d) mod 8 != 0).\n", num); @@ -148,12 +167,81 @@ inline void transformToHalf(DataType dataType, const void *src, F16 *dst, int nu } #endif +inline void transformToInt(DataType dataType, const void *src, int *dst, int num) +{ + if (num <= 0) { + return; + } + switch (dataType) { + case DT_I64: { + I64 value; + const U8 *ptr = (const U8 *)src; + for (int i = 0; i < num; i++) { + UNI_MEMCPY(&value, ptr, sizeof(I64)); + ptr += sizeof(I64); + value = value > INT_MAX ? INT_MAX : value; + dst[i] = value < INT_MIN ? INT_MIN : value; + } + break; + } + case DT_U32: + case DT_I32: { + UNI_MEMCPY(dst, src, sizeof(int) * num); + break; + } + default: { + printf("[ERROR] can not transform %s to int.\n", DataTypeName()[dataType]); + exit(1); + } + } +} + +inline unsigned short float32ToFloat16(float value) +{ + const U32 *word = (const U32 *)(&value); + unsigned short sign = (word[0] & 0x80000000) >> 31; + unsigned short exponent = (word[0] & 0x7F800000) >> 23; + unsigned int significand = word[0] & 0x7FFFFF; + + unsigned short u; + if (exponent == 0) { + u = (sign << 15) | (0x00 << 10) | 0x00; + } else if (exponent == 0xFF) { + u = (sign << 15) | (0x1F << 10) | (significand ? 0x200 : 0x00); + } else { + short newexp = exponent + (-127 + 15); + if (newexp >= 31) { + u = (sign << 15) | (0x1F << 10) | 0x00; + } else if (newexp <= 0) { + if (newexp >= -10) { + unsigned short sig = (significand | 0x800000) >> (14 - newexp); + u = (sign << 15) | (0x00 << 10) | sig; + } else { + u = (sign << 15) | (0x00 << 10) | 0x00; + } + } else { + u = (sign << 15) | (newexp << 10) | (significand >> 13); + } + } + return u; +} + inline void transformFromFloat( DataType dataType, const float *src, void *dst, int num, float scale = 1) { + if (num <= 0) { + return; + } switch (dataType) { case DT_F32: { - memcpy(dst, src, sizeof(float) * num); + UNI_MEMCPY(dst, src, sizeof(float) * num); + break; + } + case DT_I64: { + I64 *ptr = (I64 *)dst; + for (int i = 0; i < num; i++) { + ptr[i] = src[i]; + } break; } case DT_U32: { @@ -172,41 +260,16 @@ inline void transformFromFloat( } case DT_F16_8Q: case DT_F16: { -#ifdef __aarch64__ +#ifdef _USE_FP16 F16 *ptr = (F16 *)dst; #else - const U32 *word = (const U32 *)src; unsigned short *q = (unsigned short *)dst; #endif for (int i = 0; i < num; i++) { -#ifdef __aarch64__ +#ifdef _USE_FP16 ptr[i] = src[i]; #else - unsigned short sign = (word[i] & 0x80000000) >> 31; - unsigned short exponent = (word[i] & 0x7F800000) >> 23; - unsigned int significand = word[i] & 0x7FFFFF; - - unsigned short u; - if (exponent == 0) { - u = (sign << 15) | (0x00 << 10) | 0x00; - } else if (exponent == 0xFF) { - u = (sign << 15) | (0x1F << 10) | (significand ? 
0x200 : 0x00); - } else { - short newexp = exponent + (-127 + 15); - if (newexp >= 31) { - u = (sign << 15) | (0x1F << 10) | 0x00; - } else if (newexp <= 0) { - if (newexp >= -10) { - unsigned short sig = (significand | 0x800000) >> (14 - newexp); - u = (sign << 15) | (0x00 << 10) | sig; - } else { - u = (sign << 15) | (0x00 << 10) | 0x00; - } - } else { - u = (sign << 15) | (newexp << 10) | (significand >> 13); - } - } - q[i] = u; + q[i] = float32ToFloat16(src[i]); #endif } break; @@ -235,10 +298,20 @@ inline void transformFromFloat( inline void transformToFloat( DataType dataType, const void *src, float *dst, int num, float scale = 1) { + if (num <= 0) { + return; + } switch (dataType) { case DT_F32_8Q: case DT_F32: { - memcpy(dst, src, sizeof(float) * num); + UNI_MEMCPY(dst, src, sizeof(float) * num); + break; + } + case DT_I64: { + const I64 *ptr = (const I64 *)src; + for (int i = 0; i < num; i++) { + dst[i] = ptr[i]; + } break; } case DT_U32: { @@ -257,14 +330,14 @@ inline void transformToFloat( } case DT_F16_8Q: case DT_F16: { -#ifdef __aarch64__ +#ifdef _USE_FP16 const F16 *ptr = (const F16 *)src; #else const unsigned short *q = (const unsigned short *)src; U32 *word = (U32 *)dst; #endif for (int i = 0; i < num; i++) { -#ifdef __aarch64__ +#ifdef _USE_FP16 dst[i] = ptr[i]; #else unsigned short value = q[i]; @@ -350,13 +423,19 @@ inline void transformToFloat( inline void UNI_INIT(U32 num, DataType dt, F32 val, void *dst) { + if (num <= 0) { + return; + } + if (val == 0) { + UNI_MEMSET(dst, 0, bytesOf(dt) * num); + return; + } switch (dt) { case DT_F16: { - unsigned int short mem; - transformFromFloat(DT_F16, &val, &mem, 1); - U8 *arr = (U8 *)dst; + unsigned short mem = float32ToFloat16(val); + unsigned short *arr = (unsigned short *)dst; for (U32 i = 0; i < num; i++) { - memcpy(arr + i * bytesOf(DT_F16), &mem, bytesOf(DT_F16)); + arr[i] = mem; } break; } diff --git a/common/uni/include/error.h b/common/uni/include/error.h index 00af4c70..e35e2227 100644 --- a/common/uni/include/error.h +++ b/common/uni/include/error.h @@ -19,7 +19,12 @@ #include #ifdef _WIN32 +#ifdef _USE_JNI #define UNI_THREADID int tid = 0; +#else +#include +#define UNI_THREADID int tid = GetThreadId(GetCurrentThread()); +#endif #elif defined(__GLIBC__) || defined(__linux__) #include #define UNI_THREADID pid_t tid = syscall(SYS_gettid); @@ -80,23 +85,23 @@ extern "C" { }) \ } -#define UNI_WARNING_LOG(...) \ - { \ - UNI_THREADID \ - UNI_THREAD_SAFE({ \ - UNI_LOGD("[WARNING] thread %d file %s line %d ", tid, __FILE__, __LINE__); \ - UNI_LOGD(__VA_ARGS__); \ - }) \ +#define UNI_WARNING_LOG(...) \ + { \ + UNI_THREADID \ + UNI_THREAD_SAFE({ \ + UNI_LOGD("[WARNING] thread %d file %s line %d: ", tid, __FILE__, __LINE__); \ + UNI_LOGD(__VA_ARGS__); \ + }) \ } -#define UNI_ERROR_LOG(...) \ - { \ - UNI_THREADID \ - UNI_THREAD_SAFE({ \ - UNI_LOGD("[ERROR] thread %d file %s line %d ", tid, __FILE__, __LINE__); \ - UNI_LOGD(__VA_ARGS__); \ - }) \ - UNI_EXIT; \ +#define UNI_ERROR_LOG(...) \ + { \ + UNI_THREADID \ + UNI_THREAD_SAFE({ \ + UNI_LOGD("[ERROR] thread %d file %s line %d: ", tid, __FILE__, __LINE__); \ + UNI_LOGD(__VA_ARGS__); \ + }) \ + UNI_EXIT; \ } #ifdef _DEBUG diff --git a/common/uni/include/memory_cpu.h b/common/uni/include/memory_cpu.h new file mode 100644 index 00000000..271f9156 --- /dev/null +++ b/common/uni/include/memory_cpu.h @@ -0,0 +1,123 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
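Note on the new memory_cpu.h below: it centralizes CPU-side allocation, with UNI_MALLOC/UNI_FREE wrapping malloc/free and UNI_OPERATOR_NEW/UNI_OPERATOR_DELETE wrapping operator new/delete; when the library is built with _USE_MEM_CHECK every allocation is recorded in mem_statistics so unfreed blocks can be reported. A minimal usage sketch, not part of the patch; it assumes the header is on the include path and that the program links against the uni library, which defines mem_statistics:

#include "memory_cpu.h"

int main()
{
    void *a = UNI_MALLOC(64);         // returns nullptr only when size == 0
    void *b = UNI_OPERATOR_NEW(128);  // std::bad_alloc is caught and reported via UNI_ERROR_LOG
    UNI_FREE(a);
    UNI_OPERATOR_DELETE(b);
    // With _USE_MEM_CHECK defined, UNI_MEM_SIZE() is now 0 and
    // UNI_MEM_STATISTICS() reports no unfreed blocks.
    return 0;
}
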
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_UNI_MEMORY_CPU +#define _H_UNI_MEMORY_CPU + +#include "secure_c_wrapper.h" +#include +#ifdef _USE_MEM_CHECK +#include +extern std::map mem_statistics; +#endif + +inline std::string ptr2Str(const void *p) +{ + char b[64]; +#ifdef _USE_SECURE_C + sprintf_s(b, 64, "%p", p); +#else + sprintf(b, "%p", p); +#endif + return std::string(b); +} + +inline void *UNI_MALLOC(unsigned int size) +{ + void *p = nullptr; + if (size > 0) { + p = malloc(size); + if (p == nullptr) { + UNI_ERROR_LOG("cpu malloc ptr:%p bytes:%u error.\n", p, size); + } +#ifdef _USE_MEM_CHECK + UNI_DEBUG_LOG("cpu malloc ptr:%p bytes:%u.\n", p, size); + std::string key = ptr2Str(p) + std::string("(alloc by malloc)"); + mem_statistics[key] = size; +#endif + } + return p; +} + +inline void UNI_FREE(void *p) +{ + if (p == nullptr) { + return; + } +#ifdef _USE_MEM_CHECK + UNI_DEBUG_LOG("cpu free ptr:%p.\n", p); + std::string key = ptr2Str(p) + std::string("(alloc by malloc)"); + if (mem_statistics.find(key) == mem_statistics.end()) { + UNI_ERROR_LOG("try to free unalloc ptr:%p.\n", p); + } else { + mem_statistics.erase(key); + } +#endif + free(p); +} + +inline void *UNI_OPERATOR_NEW(unsigned int size) +{ + void *p = nullptr; + if (size > 0) { + try { + p = operator new(size); + } catch (const std::bad_alloc &e) { + UNI_ERROR_LOG("cpu operator new ptr:%p bytes:%u error.\n", p, size); + } +#ifdef _USE_MEM_CHECK + UNI_DEBUG_LOG("cpu operator new ptr:%p bytes:%u.\n", p, size); + std::string key = ptr2Str(p) + std::string("(alloc by operator new)"); + mem_statistics[key] = size; +#endif + } + return p; +} + +inline void UNI_OPERATOR_DELETE(void *p) +{ + if (p == nullptr) { + return; + } +#ifdef _USE_MEM_CHECK + UNI_DEBUG_LOG("cpu operator delete ptr:%p.\n", p); + std::string key = ptr2Str(p) + std::string("(alloc by operator new)"); + if (mem_statistics.find(key) == mem_statistics.end()) { + UNI_ERROR_LOG("try to operator delete unalloc ptr:%p.\n", p); + } else { + mem_statistics.erase(key); + } +#endif + operator delete(p); +} + +inline size_t UNI_MEM_SIZE() +{ + size_t size = 0; +#ifdef _USE_MEM_CHECK + for (auto iter : mem_statistics) { + size += iter.second; + } +#endif + return size; +} + +inline void UNI_MEM_STATISTICS() +{ +#ifdef _USE_MEM_CHECK + for (auto iter : mem_statistics) { + UNI_ERROR_LOG("ptr:%s bytes:%u is not free.\n", iter.first.c_str(), iter.second); + } +#endif +} +#endif diff --git a/common/uni/include/operator_type.h b/common/uni/include/operator_type.h index 
455a25f8..72e8775d 100644 --- a/common/uni/include/operator_type.h +++ b/common/uni/include/operator_type.h @@ -92,14 +92,14 @@ typedef enum { OT_SoftPlus = 69, OT_Exp = 70, - OT_Split = 71, + OT_OneHot = 71, OT_Tdnn = 72, OT_Dropout = 73, OT_TopK = 74, OT_SpaceToBatchNd = 75, OT_BatchToSpaceNd = 76, OT_Abs = 77, - OT_Equal = 78, + OT_NonZero = 78, OT_Sign = 79, OT_HSwishNoDiv = 80, @@ -113,7 +113,18 @@ typedef enum { OT_GenerateProposals = 88, OT_RoIAlign = 89, - OT_GAT = 90 + OT_GAT = 90, + OT_QuantizeLinear = 91, + OT_Round = 92, + OT_Floor = 93, + OT_Ceil = 94, + OT_RandomUniform = 95, + OT_CumSum = 96, + OT_GridSample = 97, + OT_NonMaxSuppression = 98, + OT_Range = 99, + + OT_Swish = 100 } OperatorType; inline const char *const *OperatorTypeName() @@ -140,13 +151,16 @@ inline const char *const *OperatorTypeName() "OT_DetectionOutput", "OT_Yolov3DetectionOutput", "OT_MultiHeadAttention", "OT_SqDiff", "OT_Tile", "OT_Splice", "OT_Neg", "OT_Greater", "OT_Where", "OT_SoftPlus", "OT_Exp", - "OT_Split", "OT_Tdnn", "OT_Dropout", "OT_TopK", "OT_SpaceToBatchNd", "OT_BatchToSpaceNd", - "OT_Abs", "OT_Equal", "OT_Sign", "OT_HSwishNoDiv", + "OT_OneHot", "OT_Tdnn", "OT_Dropout", "OT_TopK", "OT_SpaceToBatchNd", "OT_BatchToSpaceNd", + "OT_Abs", "OT_NonZero", "OT_Sign", "OT_HSwishNoDiv", "OT_InstanceNorm", "OT_Expand", "OT_Scatter", "OT_Select", "OT_Not", "OT_Reciprocal", "OT_Log", "OT_GenerateProposals", "OT_RoIAlign", - "OT_GAT"}; + "OT_GAT", "OT_QuantizeLinear", "OT_Round", "OT_Floor", "OT_Ceil", "OT_RandomUniform", + "OT_CumSum", "OT_GridSample", "OT_NonMaxSuppression", "OT_Range", + + "OT_Swish"}; return names; } #endif diff --git a/common/uni/include/parameter_spec.h b/common/uni/include/parameter_spec.h index 6ec321c2..a3cf1296 100644 --- a/common/uni/include/parameter_spec.h +++ b/common/uni/include/parameter_spec.h @@ -22,21 +22,24 @@ typedef enum { POOLING_MAX, POOLING_MEAN } PoolingMode; -typedef enum { CEIL, FLOOR, TF_SAME, TF_VALID, ROUND_PREFER_FLOOR, ROUND_PREFER_CEIL } RoundMode; - -typedef enum { LINEAR, NEAREST, CUBIC } ResizeMode; - typedef enum { - ROIALIGN_HALF_PIXEL, - ROIALIGN_OUTPUT_HALF_PIXEL -} ROIAlignCoordinateTransformationMode; + ROUND_CEIL, + ROUND_FLOOR, + ROUND_TF_SAME, + ROUND_TF_VALID, + ROUND_PREFER_FLOOR, + ROUND_PREFER_CEIL +} RoundMode; + +typedef enum { RESIZE_LINEAR, RESIZE_NEAREST, RESIZE_CUBIC } ResizeMode; typedef enum { - ALIGN_CORNERS, - HALF_PIXEL, - PYTORCH_HALF_PIXEL, - ASYMMETRIC -} ResizeCoordinateTransMode; + COORDINATE_TRANS_ALIGN_CORNERS, + COORDINATE_TRANS_HALF_PIXEL, + COORDINATE_TRANS_PYTORCH_HALF_PIXEL, + COORDINATE_TRANS_ASYMMETRIC, + COORDINATE_TRANS_OUTPUT_HALF_PIXEL +} CoordinateTransMode; typedef enum { ELTWISE_SUM, @@ -47,7 +50,6 @@ typedef enum { ELTWISE_DIV, ELTWISE_SQRT, ELTWISE_ERF, - ELTWISE_AND, ELTWISE_OR, ELTWISE_XOR @@ -71,23 +73,35 @@ typedef enum { ACTIVATION_H_SWISH_NODIV, ACTIVATION_LOG, ACTIVATION_NOT, - ACTIVATION_NEG + ACTIVATION_NEG, + ACTIVATION_ROUND, + ACTIVATION_FLOOR, + ACTIVATION_CEIL, + ACTIVATION_SWISH, + ACTIVATION_RECIPROCAL } ActivationMode; -typedef enum { BSliceApply_NULL, BSliceApply_CONV } BilateralSliceApplyMode; +typedef enum { BSLICE_APPLY_NULL, BSLICE_APPLY_CONV } BilateralSliceApplyMode; typedef enum { - Convolution_Pointwise, - Convolution_Dilation, - Convolution_Depthwise, - Convolution_Depthwise_Pointwise, - Convolution_Deconvolution, - Convolution_Depthwise_Deconvolution + CONVOLUTION_POINTWISE, + CONVOLUTION_DILATION, + CONVOLUTION_DEPTHWISE, + CONVOLUTION_DEPTHWISE_POINTWISE, + 
CONVOLUTION_DECONVOLUTION, + CONVOLUTION_DEPTHWISE_DECONVOLUTION } ConvolutionMode; -typedef enum { Pad_Constant, Pad_Reflect, Pad_Edge, Pad_Symmetric } PadMode; +typedef enum { PAD_CONSTANT, PAD_REFLECT, PAD_EDGE, PAD_SYMMETRIC } PadMode; -typedef enum { CHECK_EQUAL, CHECK_GREATEQUAL, CHECK_GREAT } CheckMode; +typedef enum { + CHECK_EQUAL, + CHECK_GREATER_EQUAL, + CHECK_GREATER, + CHECK_LESS, + CHECK_LESS_EQUAL, + CHECK_NOT_EQUAL +} CheckMode; typedef enum { REDUCTION_SUM, @@ -112,16 +126,6 @@ typedef enum { BGR_SC_RAW = 5 } ImageFormat; -#pragma pack(8) -typedef struct ActivationParamSpec { - ActivationMode mode; - float value[4] = {0, 0, 0, 0}; -} ActivationParamSpec; - -typedef struct { - bool propagate_down; -} PReLUParamSpec; - typedef enum { CONVOLUTION_NO_TMP_MEM, CONVOLUTION_FASTEST, @@ -137,30 +141,11 @@ typedef enum { CONVOLUTION_ALGORITHM_GEMM_ICNCHW, CONVOLUTION_ALGORITHM_WINOGRAD, CONVOLUTION_ALGORITHM_BNN, - CONVOLUTION_ALGORITHM_DIRECT_SPE_CK, + CONVOLUTION_ALGORITHM_INVGEMM, CONVOLUTION_ALGORITHM_GROUP_DECONV, CONVOLUTION_ALGORITHM_NULL } ConvolutionForwardAlgorithm; -typedef struct { - float xmin; - float ymin; - float xmax; - float ymax; - unsigned int label; -} BoxRect; - -typedef struct { - unsigned int label; - I64 box_index; -} BoxInfo; - -typedef struct { - unsigned int max_output_boxes_per_class; - float iou_threshold; - float score_threshold; -} NonMaxSuppressionParamSpec; - typedef enum { DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT, DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT, @@ -170,28 +155,47 @@ typedef enum { DEPTHWISE_CONVOLUTION_ALGORITHM_NULL } DepthwiseConvolutionForwardAlgorithm; +#pragma pack(8) +typedef struct ActivationParamSpec { + ActivationMode mode; + float value[4] = {0, 0, 0, 0}; +} ActivationParamSpec; + +typedef struct { + bool propagate_down; +} PReLUParamSpec; + +typedef struct { + int center_point_box; + unsigned int max_output_boxes_per_class; + float iou_threshold; + float score_threshold; +} NonMaxSuppressionParamSpec; + typedef struct { + // save h, w unsigned int sizes[2]; + // save n, c, h, w float scales[4]; unsigned int num_sizes; unsigned int num_scales; ResizeMode mode; - ResizeCoordinateTransMode trans_mode; + CoordinateTransMode trans_mode; RoundMode round_mode; } ResizeParamSpec; typedef struct { int axes[8]; - int axes_num; + int num_axes; } SqueezeParamSpec; typedef struct { int axes[8]; - int axes_num; + int num_axes; } UnsqueezeParamSpec; typedef struct { - DataType targetDt; + DataType dt; } CastParamSpec; typedef struct { @@ -204,8 +208,8 @@ typedef struct { } ReLUParamSpec; typedef struct { - float coeff_values[8]; - int coeff_size; + float coeff[8]; + int num_coeff; } EltwiseSumSpec; typedef struct { @@ -219,8 +223,8 @@ typedef union { } ActivationSpec; typedef struct { - EltwiseMode elt_mode; - EltwiseSumSpec elt_sum_spec; + EltwiseMode mode; + EltwiseSumSpec sum_spec; ActivationMode activation_type; ActivationSpec activation_spec; } EltwiseParamSpec; @@ -233,12 +237,12 @@ typedef struct { unsigned int stride_t; unsigned int stride_h; unsigned int stride_w; - unsigned int padding_before; - unsigned int padding_after; - unsigned int padding_top; - unsigned int padding_bottom; - unsigned int padding_left; - unsigned int padding_right; + unsigned int pad_before; + unsigned int pad_after; + unsigned int pad_top; + unsigned int pad_bottom; + unsigned int pad_left; + unsigned int pad_right; unsigned int group; unsigned int dilatedRate_t; unsigned int dilatedRate_h; @@ -248,7 +252,10 @@ typedef struct { ActivationMode 
dw_activation_type; ActivationMode pw_activation_type; ActivationSpec activation_spec; - RoundMode rm; + RoundMode round_mode; + unsigned int output_pad_t; + unsigned int output_pad_h; + unsigned int output_pad_w; } ConvolutionParamSpec; typedef struct { @@ -258,14 +265,15 @@ typedef struct { unsigned int stride_t; unsigned int stride_h; unsigned int stride_w; - unsigned int padding_before; - unsigned int padding_after; - unsigned int padding_top; - unsigned int padding_bottom; - unsigned int padding_left; - unsigned int padding_right; - RoundMode rm; + unsigned int pad_before; + unsigned int pad_after; + unsigned int pad_top; + unsigned int pad_bottom; + unsigned int pad_left; + unsigned int pad_right; + RoundMode round_mode; PoolingMode mode; + bool count_include_pad; } PoolingParamSpec; // FC's weight is reordered to NxK, K is removed dimension. @@ -307,8 +315,8 @@ typedef struct { } PadParamSpec; typedef struct { - unsigned int input_dim; - unsigned int num_output; + unsigned int num_inputs; + unsigned int num_outputs; bool bias_term; bool transpose; int axis; @@ -321,21 +329,22 @@ typedef struct { } PowerParamSpec; typedef struct { - int shape_dims[8]; - int shape_size; + int shape[8]; + int num_shape; int axis; int num_axes; } ReshapeParamSpec; typedef struct { int slice_points[8]; - unsigned int slice_size; + unsigned int num_slice; int axis; } SliceParamSpec; -typedef struct { - unsigned int trans_dims[8]; - unsigned int trans_size; +typedef struct TransposeParamSpec { + unsigned int axes[8]; + unsigned int num_axes; + DataFormat df = DF_NCHW; } TransposeParamSpec; typedef struct { @@ -346,29 +355,29 @@ typedef struct { typedef struct { RNNMode mode; - unsigned int numOutput; + unsigned int num_outputs; // steps >= 0 for multi-steps RNN // steps = -1 for RNNCell int steps; - int numProjection; - float zoneoutCell; - float zoneoutOutput; + int num_projection; + float zoneout_cell; + float zoneout_output; - bool biDirection; - float forgetBias; - ActivationMode activationMode; + bool bi_direction; + float forget_bias; + ActivationMode activation_type; } RNNParamSpec; typedef struct { - unsigned int coefficient_len; + unsigned int coefficient; BilateralSliceApplyMode mode; bool has_offset; } BilateralSliceApplyParamSpec; typedef struct { int axes[8]; - int axes_num; - ReductionMode reduction_mode; + int num_axes; + ReductionMode mode; float coeff; bool keep_dim; } ReductionParamSpec; @@ -384,7 +393,7 @@ typedef struct { } CopyParamSpec; typedef struct { - CheckMode check_mode; + CheckMode mode; } CheckParamSpec; typedef struct { @@ -392,8 +401,9 @@ typedef struct { int axis; } RepeatParamSpec; -typedef struct { +typedef struct PreAllocatedMemoryParamSpec { TensorDesc desc; + float value = 0; } PreAllocatedMemoryParamSpec; typedef struct { @@ -434,7 +444,7 @@ typedef struct { char ellipsis_mask[8]; char new_axis_mask[8]; char shrink_axis_mask[8]; - unsigned int dim_size; + unsigned int num_dims; } TfSliceParamSpec; typedef struct { @@ -478,17 +488,17 @@ typedef struct { } ChannelResizeParamSpec; typedef struct { - int blockSize; + int block_size; } Space2DepthParamSpec; typedef struct { - int blockSize; - I8 reMode[8]; + int block_size; + I8 mode[8]; } Depth2SpaceParamSpec; typedef struct { - int repeatsInfo[8]; - int dimsSize; + int repeats[8]; + int num_repeats; int axis; } TileParamSpec; @@ -511,26 +521,21 @@ typedef struct { FullyConnectedParamSpec fc_desc[6]; PowerParamSpec power_spec; bool eltwiseWithLayerNormIn[2]; - ActivationMode actiMode; + ActivationMode activation_type; 
ReshapeParamSpec reshapeDesc[4]; EltwiseParamSpec eltwiseDesc[2]; -} MultiheadAttentionParamSpec; +} MultiHeadAttentionParamSpec; typedef struct { int axis; int largest; int sorted; - int topk; + int k; } TopKParamSpec; typedef struct { - TensorDesc conditionDesc; - TensorDesc yDesc; -} WhereParamSpec; - -typedef struct { - int shape_dims[8]; - int shape_size; + int shape[8]; + int num_shape; } ExpandParamSpec; typedef struct ScatterParamSpec { @@ -558,17 +563,13 @@ typedef struct GatherParamSpec { int batch_dims = 0; } GatherParamSpec; -typedef struct EqualParamSpec { - bool invert = false; -} EqualParamSpec; - typedef struct { unsigned int num_heads; - ActivationParamSpec activation; + ActivationParamSpec activation_type; } GATParamSpec; typedef struct RoIAlignParamSpec { - ROIAlignCoordinateTransformationMode coordinateTransformationMode; + CoordinateTransMode trans_mode; PoolingMode mode; unsigned int output_h; unsigned int output_w; @@ -589,6 +590,57 @@ typedef struct GenerateProposalsParamSpec { float spatial_scale; } GenerateProposalsParamSpec; +typedef struct QuantizeLinearParamSpec { + // get the scales from input tensor + int axis; + DataType dt; +} QuantizeLinearParamSpec; + +typedef struct { + int axis; + float eps; +} LayerNormParamSpec; + +typedef struct RandomUniformParamSpec { + DataType dt; + float low; + float high; + float seed; + int shape[8]; + int num_shape; +} RandomUniformParamSpec; + +typedef struct CumSumParamSpec { + bool exclusive; + bool reverse; + bool axis; +} CumSumParamSpec; + +typedef struct GridSampleParamSpec { + ResizeMode mode; + PadMode pad_mode; + float constant_value = 0; + bool align_corners; +} GridSampleParamSpec; + +typedef struct OneHotParamSpec { + int axis; + int depth; + float values[2]; +} OneHotParamSpec; + +typedef struct ConstantOfShapeParamSpec { + DataType dt; + float value = 0; +} ConstantOfShapeParamSpec; + +typedef struct RangeParamSpec { + DataType dt; + float start; + float limit; + float delta; +} RangeParamSpec; + typedef union ParameterSpec { ParameterSpec() {} @@ -634,18 +686,25 @@ typedef union ParameterSpec { PriorBoxParamSpec prior_box_spec; DetectionOutputParamSpec detection_output_spec; Yolov3DetectionOutputParamSpec yolov3_detection_output_spec; - MultiheadAttentionParamSpec multiheadAttention_spec; + MultiHeadAttentionParamSpec multihead_attention_spec; TileParamSpec tile_spec; SpliceParamSpec splice_spec; TdnnParamSpec tdnn_spec; TopKParamSpec topk_spec; - WhereParamSpec where_spec; ExpandParamSpec expand_spec; ScatterParamSpec scatter_spec; - EqualParamSpec equal_spec; RoIAlignParamSpec roialign_spec; GenerateProposalsParamSpec generate_proposals_spec; GATParamSpec gat_spec; + QuantizeLinearParamSpec quant_spec; + LayerNormParamSpec ln_spec; + RandomUniformParamSpec random_uniform_spec; + CumSumParamSpec cumsum_spec; + GridSampleParamSpec grid_sample_spec; + OneHotParamSpec onehot_spec; + NonMaxSuppressionParamSpec non_max_suppression_spec; + ConstantOfShapeParamSpec constant_of_shape_spec; + RangeParamSpec range_spec; } ParameterSpec; typedef struct { @@ -654,7 +713,7 @@ typedef struct { } QuantSpec; #pragma pack() -inline int get_operator_parameter_size(OperatorType operatorType) +inline int get_operator_parameter_size(int version, OperatorType operatorType) { std::map operatorParameterSizeMap = {{OT_Conv, sizeof(ConvolutionParamSpec)}, {OT_Deconvolution, sizeof(ConvolutionParamSpec)}, {OT_FC, sizeof(FullyConnectedParamSpec)}, @@ -683,20 +742,41 @@ inline int get_operator_parameter_size(OperatorType operatorType) 
{OT_RelativeShift, sizeof(RelativeShiftParamSpec)}, {OT_PriorBox, sizeof(PriorBoxParamSpec)}, {OT_DetectionOutput, sizeof(DetectionOutputParamSpec)}, {OT_Yolov3DetectionOutput, sizeof(Yolov3DetectionOutputParamSpec)}, - {OT_MultiHeadAttention, sizeof(MultiheadAttentionParamSpec)}, + {OT_MultiHeadAttention, sizeof(MultiHeadAttentionParamSpec)}, {OT_Tile, sizeof(TileParamSpec)}, {OT_Splice, sizeof(SpliceParamSpec)}, {OT_Tdnn, sizeof(TdnnParamSpec)}, {OT_TopK, sizeof(TopKParamSpec)}, - {OT_Where, sizeof(WhereParamSpec)}, {OT_Expand, sizeof(ExpandParamSpec)}, - {OT_InstanceNorm, sizeof(InstanceNormParamSpec)}, {OT_Scatter, sizeof(ScatterParamSpec)}, - {OT_LogSoftmax, sizeof(SoftmaxParamSpec)}, {OT_Equal, sizeof(EqualParamSpec)}, + {OT_Expand, sizeof(ExpandParamSpec)}, {OT_InstanceNorm, sizeof(InstanceNormParamSpec)}, + {OT_Scatter, sizeof(ScatterParamSpec)}, {OT_LogSoftmax, sizeof(SoftmaxParamSpec)}, {OT_GenerateProposals, sizeof(GenerateProposalsParamSpec)}, - {OT_RoIAlign, sizeof(RoIAlignParamSpec)}, {OT_GAT, sizeof(GATParamSpec)}}; + {OT_RoIAlign, sizeof(RoIAlignParamSpec)}, {OT_GAT, sizeof(GATParamSpec)}, + {OT_QuantizeLinear, sizeof(QuantizeLinearParamSpec)}, + {OT_LayerNorm, sizeof(LayerNormParamSpec)}, + {OT_QuantizeLinear, sizeof(QuantizeLinearParamSpec)}, {OT_CumSum, sizeof(CumSumParamSpec)}, + {OT_RandomUniform, sizeof(RandomUniformParamSpec)}, + {OT_GridSample, sizeof(GridSampleParamSpec)}, {OT_OneHot, sizeof(OneHotParamSpec)}, + {OT_NonMaxSuppression, sizeof(NonMaxSuppressionParamSpec)}, + {OT_Range, sizeof(RangeParamSpec)}, {OT_ConstantOfShape, sizeof(ConstantOfShapeParamSpec)}}; int size; if (operatorParameterSizeMap.find(operatorType) == operatorParameterSizeMap.end()) { size = 0; } else { size = operatorParameterSizeMap[operatorType]; } + if (version == 20201120) { + if (operatorType == OT_Conv || operatorType == OT_Deconvolution) { + size -= 3 * sizeof(unsigned int); + } + if (operatorType == OT_LayerNorm) { + size = 0; + } + } else { + size = (size + 3) / 4 * 4; + } + if (version == 20201120 || version == 20211021) { + if (operatorType == OT_Transpose) { + size -= sizeof(DataFormat); + } + } return size; } @@ -707,12 +787,12 @@ inline ConvolutionParamSpec createConvolutionParamSpec(unsigned int group, unsigned int stride_t, unsigned int stride_h, unsigned int stride_w, - unsigned int padding_before, - unsigned int padding_after, - unsigned int padding_top, - unsigned int padding_bottom, - unsigned int padding_left, - unsigned int padding_right, + unsigned int pad_before, + unsigned int pad_after, + unsigned int pad_top, + unsigned int pad_bottom, + unsigned int pad_left, + unsigned int pad_right, unsigned int dilateRate_t, unsigned int dilateRate_h, unsigned int dilateRate_w, @@ -727,17 +807,20 @@ inline ConvolutionParamSpec createConvolutionParamSpec(unsigned int group, p.stride_t = stride_t; p.stride_h = stride_h; p.stride_w = stride_w; - p.padding_before = padding_before; - p.padding_after = padding_after; - p.padding_top = padding_top; - p.padding_bottom = padding_bottom; - p.padding_left = padding_left; - p.padding_right = padding_right; + p.pad_before = pad_before; + p.pad_after = pad_after; + p.pad_top = pad_top; + p.pad_bottom = pad_bottom; + p.pad_left = pad_left; + p.pad_right = pad_right; p.dilatedRate_t = dilateRate_t; p.dilatedRate_h = dilateRate_h; p.dilatedRate_w = dilateRate_w; p.num_outputs = num_outputs; p.convolution_type = convMode; + p.output_pad_t = 0; + p.output_pad_h = 0; + p.output_pad_w = 0; return p; } @@ -762,13 +845,13 @@ inline PoolingParamSpec 
createPoolingParamSpec(PoolingMode pm, unsigned int stride_t, unsigned int stride_h, unsigned int stride_w, - unsigned int padding_before, - unsigned int padding_after, - unsigned int padding_top, - unsigned int padding_bottom, - unsigned int padding_left, - unsigned int padding_right, - RoundMode rm) + unsigned int pad_before, + unsigned int pad_after, + unsigned int pad_top, + unsigned int pad_bottom, + unsigned int pad_left, + unsigned int pad_right, + RoundMode round_mode) { PoolingParamSpec p; p.mode = pm; @@ -778,26 +861,25 @@ inline PoolingParamSpec createPoolingParamSpec(PoolingMode pm, p.stride_t = stride_t; p.stride_h = stride_h; p.stride_w = stride_w; - p.padding_before = padding_before; - p.padding_after = padding_after; - p.padding_top = padding_top; - p.padding_bottom = padding_bottom; - p.padding_left = padding_left; - p.padding_right = padding_right; - p.rm = rm; + p.pad_before = pad_before; + p.pad_after = pad_after; + p.pad_top = pad_top; + p.pad_bottom = pad_bottom; + p.pad_left = pad_left; + p.pad_right = pad_right; + p.round_mode = round_mode; return p; } -inline ReshapeParamSpec createReshapeParamSpec( - int *shape_dims, int shape_size, int axis, int num_axes) +inline ReshapeParamSpec createReshapeParamSpec(int *shape, int num_shape, int axis, int num_axes) { ReshapeParamSpec p; - p.shape_size = shape_size; + p.num_shape = num_shape; p.axis = axis; p.num_axes = num_axes; - if (shape_dims != nullptr && shape_size != 0) { - for (int i = 0; i < shape_size; i++) { - p.shape_dims[i] = shape_dims[i]; + if (shape != nullptr && num_shape != 0) { + for (int i = 0; i < num_shape; i++) { + p.shape[i] = shape[i]; } } return p; @@ -811,12 +893,12 @@ inline ClipParamSpec createClipParamSpec(float min, float max) return p; } -inline SqueezeParamSpec createSqueezeParamSpec(int *axes, int axes_num) +inline SqueezeParamSpec createSqueezeParamSpec(int *axes, int num_axes) { SqueezeParamSpec p; - p.axes_num = axes_num; - if (axes != nullptr && axes_num != 0) { - for (int i = 0; i < axes_num; i++) { + p.num_axes = num_axes; + if (axes != nullptr && num_axes != 0) { + for (int i = 0; i < num_axes; i++) { p.axes[i] = axes[i]; } } diff --git a/common/uni/include/profiling.h b/common/uni/include/profiling.h index e987be19..31e29740 100644 --- a/common/uni/include/profiling.h +++ b/common/uni/include/profiling.h @@ -18,15 +18,21 @@ double ut_time_ms(); void ut_time_init(); +void ut_time_start(); +void ut_time_stop(); void ut_time_process( const std::string &name, const std::string &category, double time_start_ms, double time_end_ms); void ut_time_statistics(); #ifdef _PROFILE_STATISTICS #define UNI_TIME_INIT ut_time_init(); +#define UNI_TIME_START ut_time_start(); +#define UNI_TIME_STOP ut_time_stop(); #define UNI_TIME_STATISTICS ut_time_statistics(); #else #define UNI_TIME_INIT +#define UNI_TIME_START +#define UNI_TIME_STOP #define UNI_TIME_STATISTICS #endif diff --git a/common/uni/include/secure_c_wrapper.h b/common/uni/include/secure_c_wrapper.h new file mode 100644 index 00000000..06b3e9aa --- /dev/null +++ b/common/uni/include/secure_c_wrapper.h @@ -0,0 +1,66 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
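Note on the new secure_c_wrapper.h below: it routes memory and string operations through the securec functions (memcpy_s, memset_s, strcpy_s, sscanf_s, snprintf_truncated_s) when _USE_SECURE_C is defined via the USE_SECURE_C build option, and falls back to the standard libc calls otherwise; UNI_MEMCPY additionally tolerates src == dst and size == 0. A minimal usage sketch, not part of the patch; the format string used here behaves identically under sscanf and sscanf_s:

#include "secure_c_wrapper.h"

int main()
{
    char name[16];
    UNI_MEMSET(name, 0, sizeof(name));
    UNI_STRCPY(name, "bolt");
    char copy[16];
    UNI_MEMCPY(copy, name, sizeof(name));
    int value = 0;
    UNI_SSCANF("42", "%d", &value);   // value == 42 with either backend
    return 0;
}
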
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_SECURE_C_WRAPPER +#define _H_SECURE_C_WRAPPER +#ifdef _USE_SECURE_C +#include +#else +#include +#endif + +#include "error.h" + +inline void UNI_MEMCPY(void *dst, const void *src, size_t size) +{ + if (src == dst || size == 0) { + return; + } + if (dst == NULL || src == NULL) { + printf("cpu memcpy error dst:%p src:%p bytes:%d.\n", dst, src, (int)size); + } + //UNI_DEBUG_LOG("cpu memcpy dst:%p src:%p bytes:%d.\n", dst, src, (int)size); +#ifdef _USE_SECURE_C + memcpy_s(dst, size, src, size); +#else + memcpy(dst, src, size); +#endif +} + +inline void UNI_MEMSET(void *dst, int c, size_t size) +{ +#ifdef _USE_SECURE_C + memset_s(dst, size, c, size); +#else + memset(dst, c, size); +#endif +} + +inline void UNI_STRCPY(char *dst, const char *src) +{ +#ifdef _USE_SECURE_C + strcpy_s(dst, strlen(src) + 1, src); +#else + strcpy(dst, src); +#endif +} + +#ifdef _USE_SECURE_C +#define UNI_SSCANF sscanf_s +#define UNI_SNPRINTF snprintf_truncated_s +#else +#define UNI_SSCANF sscanf +#define UNI_SNPRINTF snprintf +#endif + +#endif diff --git a/common/uni/include/thread_affinity.h b/common/uni/include/thread_affinity.h index db21b777..2f63dc00 100644 --- a/common/uni/include/thread_affinity.h +++ b/common/uni/include/thread_affinity.h @@ -17,15 +17,14 @@ #ifndef _WIN32 #include #include -#endif -#ifdef _USE_OPENMP -#include +#else +#include #endif #include -#include #include "sys.h" #include "error.h" #include "data_type.h" +#include "affinity_policy.h" #ifdef _USE_X86 #define __cpuid(data, eaxIn, ecxIn) \ @@ -34,53 +33,6 @@ : "0"(eaxIn), "2"(ecxIn)) #endif -const int CPU_MAX_NUMBER = 128; -#ifdef _USE_OPENMP -#define OMP_MAX_NUM_THREADS \ - (getenv("OMP_NUM_THREADS") == NULL ? 
omp_get_num_procs() : atoi(getenv("OMP_NUM_THREADS"))) -#else -#define OMP_MAX_NUM_THREADS 1 -#endif -extern int OMP_NUM_THREADS; - -typedef enum { - AFFINITY_CPU_LOW_POWER = 0, - AFFINITY_CPU_HIGH_PERFORMANCE = 1, - AFFINITY_GPU = 2 -} AffinityPolicy; - -typedef struct CpuStat { - unsigned long idle; - unsigned long total; -} CpuStat; - -typedef struct DeviceInfo { - int cpuNum; - Arch archs[CPU_MAX_NUMBER]; - long freqs[CPU_MAX_NUMBER]; - float occupys[CPU_MAX_NUMBER]; - int cpuids[CPU_MAX_NUMBER]; - CpuStat cpuStats[CPU_MAX_NUMBER]; - - float maxOccupy; - AffinityPolicy affinityPolicy; - Arch schedule; -} DeviceInfo; - -inline const char *const *AffinityPolicyNames() -{ - static const char *const names[] = { - "CPU_AFFINITY_LOW_POWER", "CPU_AFFINITY_HIGH_PERFORMANCE", "GPU"}; - return names; -} - -inline const AffinityPolicy *AffinityPolicies() -{ - static const AffinityPolicy policies[] = { - AFFINITY_CPU_LOW_POWER, AFFINITY_CPU_HIGH_PERFORMANCE, AFFINITY_GPU}; - return policies; -} - inline int get_cpus_num() { int cpuNum = 0; @@ -166,7 +118,7 @@ inline void get_cpus_arch(Arch *archs, int cpuNum) } const int bufferSize = 1024; char buffer[bufferSize]; - while (!feof(fp)) { + while (!feof(fp) && cpuid < cpuNum) { char *status = fgets(buffer, bufferSize, fp); if (!status) { break; @@ -175,7 +127,7 @@ inline void get_cpus_arch(Arch *archs, int cpuNum) if (memcmp(buffer, "CPU part", 8) == 0) { Arch arch = ARM_V8; int id = 0; - sscanf(buffer, "CPU part\t: %x", &id); + UNI_SSCANF(buffer, "CPU part\t: %x", &id); switch (id) { case 0xc07: arch = ARM_V7; @@ -244,7 +196,7 @@ inline void get_cpus_arch(Arch *archs, int cpuNum) arch = ARM_V8; break; default: - UNI_WARNING_LOG("unknown CPU %d arch %x, set to ARM_V8\n", cpuid, id); + UNI_DEBUG_LOG("unknown CPU %d arch %x, set to ARM_V8\n", cpuid, id); break; } archs[cpuid++] = arch; @@ -257,6 +209,28 @@ inline void get_cpus_arch(Arch *archs, int cpuNum) } } +inline Arch get_cpu_arch() +{ + static bool blank = true; + static Arch arch = CPU_GENERAL; + if (blank) { + UNI_THREAD_SAFE({ + if (blank) { + int num = get_cpus_num(); + Arch archs[CPU_MAX_NUMBER]; + get_cpus_arch(archs, num); + for (int i = 0; i < num; i++) { + if (archs[i] > arch) { + arch = archs[i]; + } + } + blank = false; + } + }); + } + return arch; +} + inline long get_cpu_freq(int cpuid) { long maxFrequency = -1; @@ -264,24 +238,26 @@ inline long get_cpu_freq(int cpuid) char path[256]; FILE *fp = NULL; if (fp == NULL) { - snprintf( + UNI_SNPRINTF( path, sizeof(path), "/sys/devices/system/cpu/cpufreq/stats/cpu%d/time_in_state", cpuid); fp = fopen(path, "rb"); } if (fp == NULL) { - snprintf( + UNI_SNPRINTF( path, sizeof(path), "/sys/devices/system/cpu/cpu%d/cpufreq/stats/time_in_state", cpuid); fp = fopen(path, "rb"); } if (fp == NULL) { - snprintf( + UNI_SNPRINTF( path, sizeof(path), "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq", cpuid); fp = fopen(path, "rb"); } if (fp == NULL) { - UNI_WARNING_LOG("can not get CPU max frequency\n"); + UNI_DEBUG_LOG("can not get CPU max frequency\n"); } else { - fscanf(fp, "%ld", &maxFrequency); + char buffer[32]; + fgets(buffer, 32, fp); + UNI_SSCANF(buffer, "%ld", &maxFrequency); fclose(fp); } #endif @@ -314,7 +290,7 @@ inline void get_cpus_occupy(CpuStat *cpuStat, float *cpuOccupy, int cpuNum) for (int i = 0; i < cpuNum; i++) { fgets(buffer, bufferSize, fp); - sscanf(buffer, "%s %lu %lu %lu %lu %lu %lu %lu", name, &user, &nice, &system, &idle, + UNI_SSCANF(buffer, "%s %lu %lu %lu %lu %lu %lu %lu", name, &user, &nice, &system, &idle, 
&iowait, &irq, &softirq); total = user + nice + system + idle + iowait + irq + softirq; cpuOccupy[i] = 0; @@ -334,9 +310,9 @@ inline void get_cpus_occupy(CpuStat *cpuStat, float *cpuOccupy, int cpuNum) inline void swap_variable(void *a, void *b, const int size) { char buffer[size]; - memcpy(buffer, a, size); - memcpy(a, b, size); - memcpy(b, buffer, size); + UNI_MEMCPY(buffer, a, size); + UNI_MEMCPY(a, b, size); + UNI_MEMCPY(b, buffer, size); } inline void disable_cpus(float *occupys, int *cpuids, int cpuNum, float cpuOccupyMax) @@ -386,7 +362,19 @@ inline void sort_cpus_by_arch_freq_occupy( inline int set_thread_affinity(int threadid, const int *cpuids, int num) { -#if !(defined(__APPLE__) || defined(_WIN32)) +#ifdef _WIN32 + DWORD_PTR mask = 0x0; + for (int i = 0; i < num; i++) { + UNI_DEBUG_LOG("bind thread %d to core %d\n", threadid, cpuids[i]); + DWORD_PTR m = 0x1; + for (int j = 0; j < cpuids[i]; j++) { + m = m << 1; + } + mask |= m; + } + HANDLE thread = GetCurrentThread(); + SetThreadAffinityMask(thread, mask); +#elif !defined(__APPLE__) UNI_THREADID; cpu_set_t mask; CPU_ZERO(&mask); @@ -396,38 +384,13 @@ inline int set_thread_affinity(int threadid, const int *cpuids, int num) } int status = syscall(__NR_sched_setaffinity, tid, sizeof(mask), &mask); if (status) { - UNI_WARNING_LOG("fail to set affinity %d\n", status); + UNI_DEBUG_LOG("fail to set affinity %d\n", status); return -1; } #endif return 0; } -inline AffinityPolicy thread_affinity_get_policy_by_name(const char *name) -{ - int nameLength = strlen(name); - for (int i = 0; i < 3; i++) { - const char *target = AffinityPolicyNames()[i]; - int targetLength = strlen(target); - if (nameLength < targetLength) { - continue; - } - int match = 1; - for (int j = 0; j < targetLength; j++) { - if (name[j] == target[j] || name[j] == target[j] + 32) { - continue; - } else { - match = 0; - break; - } - } - if (match) { - return AffinityPolicies()[i]; - } - } - return AFFINITY_CPU_HIGH_PERFORMANCE; -} - inline Arch thread_affinity_set_by_policy( Arch *archs, int *cpuids, int cpuNum, AffinityPolicy policy, int threadId) { @@ -435,7 +398,9 @@ inline Arch thread_affinity_set_by_policy( UNI_WARNING_LOG("can not allocate more cores for thread %d\n", threadId); return CPU_GENERAL; } - if (policy == AFFINITY_GPU) { + if (policy == AFFINITY_CPU) { + return archs[cpuNum - 1]; + } else if (policy == AFFINITY_GPU) { return MALI; } #ifndef _USE_OPENMP @@ -481,6 +446,12 @@ inline Arch thread_affinity_set_by_policy( candidates[count++] = i; } } + if (OMP_NUM_THREADS > count) { + count = 0; + for (int i = 0; i < cpuNum; i++) { + candidates[count++] = i; + } + } set_thread_affinity(threadId, candidates, count); Arch arch = archs[index]; #endif @@ -546,21 +517,4 @@ inline void set_cpu_dynamic(DeviceInfo *deviceInfo, int threadId) deviceInfo->schedule = MALI; } } - -inline void set_cpu_num_threads(int threadNum) -{ -#ifndef _USE_OPENMP - if (threadNum > 1) { - UNI_WARNING_LOG("this library not support multi-threads parallel, please rebuild with " - "--openmp option.\n"); - } -#endif - if (threadNum < 0) { - threadNum = 1; - } - if (threadNum > OMP_MAX_NUM_THREADS) { - threadNum = OMP_MAX_NUM_THREADS; - } - OMP_NUM_THREADS = threadNum; -} #endif diff --git a/common/uni/include/uni.h b/common/uni/include/uni.h index 1af06c54..99499ccc 100644 --- a/common/uni/include/uni.h +++ b/common/uni/include/uni.h @@ -14,39 +14,21 @@ #ifndef _H_UNI #define _H_UNI -#include -#include - #include "sys.h" #include "data_type.h" #include "operator_type.h" #include 
"parameter_spec.h" #include "error.h" #include "array_transpose.h" +#include "memory_cpu.h" +#include "affinity_policy.h" #define UNUSED(x) (void)x #define UNI_MIN(a, b) (((a) < (b)) ? (a) : (b)) #define UNI_MAX(a, b) (((a) > (b)) ? (a) : (b)) #define UNI_ABS(a) (((a) > 0) ? (a) : (-1 * (a))) #define UNI_SIGN(a) (((a) > 0) ? 1 : (((a) < 0) ? -1 : 0)) +#define UNI_ALIGN(a, b) (((a + b - 1) / b) * b) #define UNI_F16_MIN -65504.0f #define UNI_F16_MAX 65504.0f - -inline int UNI_ISNAN(float x) -{ - return isnan(x); -} - -inline int UNI_ISINF(float x) -{ - return isinf(x); -} - -inline void UNI_MEMCPY(void *dst, const void *src, int size) -{ - if (src == dst || size <= 0 || dst == nullptr || src == nullptr) { - return; - } - memcpy(dst, src, size); -} #endif diff --git a/common/uni/include/ut_util.h b/common/uni/include/ut_util.h index e8447ff5..414f3a07 100644 --- a/common/uni/include/ut_util.h +++ b/common/uni/include/ut_util.h @@ -14,7 +14,7 @@ #ifndef _H_UT_UTIL #define _H_UT_UTIL -#include +#include #include "sys.h" #include "uni.h" @@ -181,6 +181,10 @@ inline void ut_check_v( a = ((INT8 *)A)[i]; b = ((INT8 *)B)[i]; break; + case DT_U8: + a = ((U8 *)A)[i]; + b = ((U8 *)B)[i]; + break; case DT_BIN11: a = ((BIN8 *)A)[i]; b = ((BIN8 *)B)[i]; @@ -217,6 +221,9 @@ inline void ut_check_v(void *A, F32 val, U32 len, DataType dt, const char *file, case DT_U32: a = ((U32 *)A)[i]; break; + case DT_U8: + a = ((U8 *)A)[i]; + break; case DT_BIN11: a = ((BIN8 *)A)[i]; break; @@ -245,10 +252,10 @@ inline void ut_check_a(void *A, void *B, U32 len, DataType dt) switch (dt) { case DT_F32: case DT_F16: - memcpy(threshold, threshold_float, sizeof(F32) * num); + UNI_MEMCPY(threshold, threshold_float, sizeof(F32) * num); break; case DT_U8: - memcpy(threshold, threshold_int8, sizeof(F32) * num); + UNI_MEMCPY(threshold, threshold_int8, sizeof(F32) * num); break; default: UNI_ERROR_LOG("unsupported data type.\n"); @@ -274,11 +281,11 @@ inline void ut_check_a(void *A, void *B, U32 len, DataType dt) break; } - if (UNI_ISNAN((float)a) || UNI_ISINF((float)a)) { + if (isnan((float)a) || isinf((float)a)) { UNI_ERROR_LOG("nan or inf value in ut_check_a of input A\n"); return; } - if (UNI_ISNAN((float)b) || UNI_ISINF((float)b)) { + if (isnan((float)b) || isinf((float)b)) { UNI_ERROR_LOG("nan or inf value in ut_check_a of input B\n"); return; } diff --git a/common/uni/include/x86_avx2_expand.h b/common/uni/include/x86_avx2_expand.h index b8d422f7..fbcca54e 100644 --- a/common/uni/include/x86_avx2_expand.h +++ b/common/uni/include/x86_avx2_expand.h @@ -30,22 +30,31 @@ inline unsigned int _mm256_hadd_u32(__m256i x) inline __m256 _mm256_log_ps(__m256 x) { - static const __m256 CONST_one = _mm256_set1_ps(1.0f); - static const __m256 CONST_two = _mm256_set1_ps(2.0f); - static const __m256 CONST_neg_one = _mm256_set1_ps(-1.0f); - F32 i = 30; - __m256 n = _mm256_set1_ps(i); - __m256 nk = _mm256_add_ps(_mm256_mul_ps(CONST_two, n), CONST_one); - x = _mm256_div_ps(_mm256_add_ps(x, CONST_neg_one), _mm256_add_ps(x, CONST_one)); - __m256 xx = _mm256_mul_ps(x, x); - __m256 y = _mm256_div_ps(CONST_one, nk); - for (; i > 0; i--) { - nk = _mm256_sub_ps(nk, CONST_two); - y = _mm256_add_ps(_mm256_div_ps(CONST_one, nk), _mm256_mul_ps(xx, y)); - } - - y = _mm256_mul_ps(CONST_two, _mm256_mul_ps(x, y)); - return y; + __m256i ux = _mm256_castps_si256(x); + __m256 fx = _mm256_cvtepi32_ps(ux); + fx = _mm256_mul_ps(fx, + _mm256_div_ps( + _mm256_set1_ps(1.0f), _mm256_cvtepi32_ps(_mm256_slli_epi32(_mm256_set1_epi32(1), 23)))); + + __m256i umx = 
_mm256_or_si256(_mm256_and_si256(ux, _mm256_set1_epi32(0x007FFFFF)), + _mm256_slli_epi32(_mm256_set1_epi32(0x7e), 23)); + __m256 mx = _mm256_castsi256_ps(umx); + + const __m256 c_124_22551499 = _mm256_set1_ps(124.22551499f); + const __m256 c_1_498030302 = _mm256_set1_ps(1.498030302f); + const __m256 c_1_725877999 = _mm256_set1_ps(1.72587999f); + const __m256 c_0_3520087068 = _mm256_set1_ps(0.3520887068f); + + __m256 tmp = _mm256_div_ps(c_1_725877999, _mm256_add_ps(c_0_3520087068, mx)); + tmp = _mm256_add_ps(c_124_22551499, tmp); + tmp = _mm256_fmadd_ps(c_1_498030302, mx, tmp); + const __m256 c_0_69314718 = _mm256_set1_ps(0.69314718f); + __m256 result_v = _mm256_mul_ps(_mm256_sub_ps(fx, tmp), c_0_69314718); + result_v = _mm256_blendv_ps( + result_v, _mm256_set1_ps(NAN), _mm256_cmp_ps(x, _mm256_set1_ps(0), _CMP_LT_OS)); + result_v = _mm256_blendv_ps( + result_v, _mm256_set1_ps(-INFINITY), _mm256_cmp_ps(x, _mm256_set1_ps(0), _CMP_EQ_OS)); + return result_v; } inline __m256 _mm256_exp_ps(__m256 x) @@ -121,6 +130,17 @@ inline F32 _mm256_sum_ps(__m256 x) return _mm_cvtss_f32(sum); } +inline I32 _mm256_sum_epi32(__m256i x) +{ + __m128i low = _mm256_extractf128_si256(x, 0); + __m128i high = _mm256_extractf128_si256(x, 1); + __m128i sum = _mm_hadd_epi32(low, high); + low = _mm_hadd_epi32(sum, sum); + high = _mm_shuffle_epi32(low, 0b01); + sum = _mm_add_epi32(low, high); + return _mm_cvtsi128_si32(sum); +} + // horizontal min inline F32 _mm256_hmin_ps(__m256 x) { diff --git a/common/uni/src/CMakeLists.txt b/common/uni/src/CMakeLists.txt index ef8301af..3ba24ae9 100644 --- a/common/uni/src/CMakeLists.txt +++ b/common/uni/src/CMakeLists.txt @@ -6,6 +6,10 @@ add_library(${PROJECT_NAME} SHARED ${srcs}) # static library add_library(${PROJECT_NAME}_static STATIC ${srcs}) +if (USE_SECURE_C) + target_link_libraries(${PROJECT_NAME} LINK_PUBLIC ${SECUREC_SHARED_LIBRARY}) +endif () + set_target_properties(${PROJECT_NAME}_static PROPERTIES OUTPUT_NAME "${PROJECT_NAME}") set_target_properties(${PROJECT_NAME} PROPERTIES CLEAN_DIRECT_OUTPUT 1) set_target_properties(${PROJECT_NAME}_static PROPERTIES CLEAN_DIRECT_OUTPUT 1) diff --git a/common/uni/src/profiling.cpp b/common/uni/src/profiling.cpp index 9271184f..b7e90851 100644 --- a/common/uni/src/profiling.cpp +++ b/common/uni/src/profiling.cpp @@ -25,9 +25,15 @@ int OMP_NUM_THREADS = OMP_MAX_NUM_THREADS; #ifdef _THREAD_SAFE pthread_mutex_t uniThreadMutex = PTHREAD_MUTEX_INITIALIZER; #endif -std::map time_statistics; +static std::map time_statistics; +static bool time_statistics_flag = true; #ifndef _EAGER_LOG -std::vector logs; +static std::vector logs; +#endif + +#ifdef _USE_MEM_CHECK +#include "memory_cpu.h" +std::map mem_statistics; #endif double ut_time_ms() @@ -40,7 +46,20 @@ double ut_time_ms() void ut_time_init() { - UNI_THREAD_SAFE(time_statistics.clear()); + UNI_THREAD_SAFE({ + time_statistics.clear(); + time_statistics_flag = true; + }); +} + +void ut_time_start() +{ + UNI_THREAD_SAFE({ time_statistics_flag = true; }); +} + +void ut_time_stop() +{ + UNI_THREAD_SAFE({ time_statistics_flag = false; }); } inline std::string ut_profile_log(const std::string &name, @@ -84,6 +103,9 @@ void ut_time_process( #endif #endif + if (!time_statistics_flag) { + return; + } #ifdef _PROFILE_STATISTICS double duration = time_end_ms - time_start_ms; UNI_THREAD_SAFE({ @@ -99,6 +121,9 @@ void ut_time_process( void ut_time_statistics() { #ifndef _EAGER_LOG + printf("\nFunction Time:\n{\"name\": function name, \"cat\": function category, \"ph\": " + "function type, \"pid\": 
process id, \"tid\": thread id, \"ts\": start time(ms), " + "\"dur\": duration time(vs, gpu will have 1 ms synchronization overhead)\n"); for (unsigned int i = 0; i < logs.size(); i++) { UNI_PROFILE_LOG("%s\n", logs[i].c_str()); } diff --git a/compute/blas_enhance/src/CMakeLists.txt b/compute/blas_enhance/src/CMakeLists.txt index 00fb24a4..24fda937 100644 --- a/compute/blas_enhance/src/CMakeLists.txt +++ b/compute/blas_enhance/src/CMakeLists.txt @@ -29,6 +29,8 @@ if (USE_NEON) if (USE_INT8) file(GLOB arm_int8_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/arm/int8/*.cpp) if (USE_FP16) + file(GLOB armv8_int8_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/arm/int8/v8.2/*.cpp) + elseif ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "aarch64") file(GLOB armv8_int8_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/arm/int8/v8/*.cpp) else () file(GLOB armv8_int8_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/arm/int8/v7/*.cpp) @@ -47,6 +49,9 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}) # shared library add_library(${PROJECT_NAME} SHARED ${srcs}) target_link_libraries(${PROJECT_NAME} LINK_PUBLIC uni) +if (USE_SECURE_C) + target_link_libraries(${PROJECT_NAME} LINK_PUBLIC ${SecureC_SHARED_LIBRARY}) +endif () # static library add_library(${PROJECT_NAME}_static STATIC ${srcs}) diff --git a/compute/blas_enhance/src/axpby.cpp b/compute/blas_enhance/src/axpby.cpp index 6f7cb448..6cc6fd02 100644 --- a/compute/blas_enhance/src/axpby.cpp +++ b/compute/blas_enhance/src/axpby.cpp @@ -18,6 +18,9 @@ #ifdef _USE_NEON #include "cpu/arm/blas_arm.h" #endif +#ifdef _USE_X86 +#include "cpu/x86/blas_x86.h" +#endif EE vector_vector_axpby( F32 a, TensorDesc xDesc, const void *x, F32 b, TensorDesc yDesc, void *y, Arch arch) @@ -45,8 +48,12 @@ EE vector_vector_axpby( ret = axpby_general(yLen, yDataType, a, x, b, y); #endif #ifdef _USE_NEON - } else { + } else if (IS_ARM(arch)) { ret = axpby_arm(yLen, yDataType, a, x, b, y, arch); +#endif +#ifdef _USE_X86 + } else if (IS_X86(arch)) { + ret = axpby_x86(yLen, yDataType, a, x, b, y); #endif } return ret; diff --git a/compute/blas_enhance/src/cpu/arm/axpby.cpp b/compute/blas_enhance/src/cpu/arm/axpby.cpp index 681ac07c..f13510bf 100644 --- a/compute/blas_enhance/src/cpu/arm/axpby.cpp +++ b/compute/blas_enhance/src/cpu/arm/axpby.cpp @@ -11,8 +11,6 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-#include "error.h" - #include "cpu/arm/blas_arm.h" #ifdef _USE_FP16 #include "cpu/arm/fp16/blas_fp16.h" @@ -23,13 +21,10 @@ EE axpby_arm(U32 len, DataType dt, F32 a, const void *x, F32 b, void *y, Arch arch) { - EE ret = SUCCESS; + EE ret = NOT_SUPPORTED; switch (dt) { #ifdef _USE_FP16 case DT_F16: - if (ARM_A55 != arch && ARM_A76 != arch) { - return NOT_SUPPORTED; - } ret = axpby_fp16(len, a, (F16 *)x, b, (F16 *)y); break; #endif @@ -39,7 +34,6 @@ EE axpby_arm(U32 len, DataType dt, F32 a, const void *x, F32 b, void *y, Arch ar break; #endif default: - ret = NOT_SUPPORTED; break; } return ret; diff --git a/compute/blas_enhance/src/cpu/arm/fp16/mmm_common.h b/compute/blas_enhance/src/cpu/arm/fp16/mmm_common.h index 9b618c59..f879ba01 100644 --- a/compute/blas_enhance/src/cpu/arm/fp16/mmm_common.h +++ b/compute/blas_enhance/src/cpu/arm/fp16/mmm_common.h @@ -13,7 +13,7 @@ #ifndef _H_MMM_COMMON #define _H_MMM_COMMON -#include + #include #include "data_type.h" #include "uni.h" @@ -41,7 +41,7 @@ inline void matrix2_trans(U32 size, U32 blockK, U32 M, F16 *src, F16 *dst) { for (U32 i = 0; i < blockK; i++) { asm volatile("prfm pldl2keep, [%0, #48]\n" : "+r"(src) : : "memory", "cc"); - memcpy(dst, src, size * sizeof(F16)); + UNI_MEMCPY(dst, src, size * sizeof(F16)); dst += size; src += M; } diff --git a/compute/blas_enhance/src/cpu/arm/fp16/mvm.cpp b/compute/blas_enhance/src/cpu/arm/fp16/mvm.cpp index 2b8af932..baedff5f 100644 --- a/compute/blas_enhance/src/cpu/arm/fp16/mvm.cpp +++ b/compute/blas_enhance/src/cpu/arm/fp16/mvm.cpp @@ -31,7 +31,7 @@ EE matrix_vector_multiply_transform_weight_fp16(TensorDesc desc, F16 *src, F16 * matrix1_trans(64, K, K, src + i * K, dst + i * K); } if (i < (int)N) { - memcpy(dst + i * K, src + i * K, (N - i) * K * bytesOf(DT_F16)); + UNI_MEMCPY(dst + i * K, src + i * K, (N - i) * K * bytesOf(DT_F16)); } break; } diff --git a/compute/blas_enhance/src/cpu/arm/fp32/axpby.cpp b/compute/blas_enhance/src/cpu/arm/fp32/axpby.cpp index a1761246..36e5e6f5 100644 --- a/compute/blas_enhance/src/cpu/arm/fp32/axpby.cpp +++ b/compute/blas_enhance/src/cpu/arm/fp32/axpby.cpp @@ -11,7 +11,6 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-#include "error.h" #include "cpu/arm/fp32/blas_fp32.h" EE axpby_fp32(U32 len, F32 a, const F32 *x, F32 b, F32 *y) diff --git a/compute/blas_enhance/src/cpu/arm/fp32/blas_fp32.h b/compute/blas_enhance/src/cpu/arm/fp32/blas_fp32.h index 4517bd72..2e5c5d21 100644 --- a/compute/blas_enhance/src/cpu/arm/fp32/blas_fp32.h +++ b/compute/blas_enhance/src/cpu/arm/fp32/blas_fp32.h @@ -66,7 +66,7 @@ inline void matrix2_trans(U32 size, U32 blockK, U32 M, F32 *src, F32 *dst) if (i % 16 == 0) { __builtin_prefetch(src + 16); } - memcpy(dst, src, size * sizeof(F32)); + UNI_MEMCPY(dst, src, size * sizeof(F32)); dst += size; src += M; } diff --git a/compute/blas_enhance/src/cpu/arm/fp32/mvm.cpp b/compute/blas_enhance/src/cpu/arm/fp32/mvm.cpp index 0ce3b7ae..e695f424 100644 --- a/compute/blas_enhance/src/cpu/arm/fp32/mvm.cpp +++ b/compute/blas_enhance/src/cpu/arm/fp32/mvm.cpp @@ -28,7 +28,7 @@ EE matrix_vector_multiply_transform_weight_fp32(TensorDesc desc, F32 *src, F32 * matrix1_trans(16, K, K, src + i * K, dst + i * K); } if (i < (int)N) { - memcpy(dst + i * K, src + i * K, (N - i) * K * bytesOf(DT_F32)); + UNI_MEMCPY(dst + i * K, src + i * K, (N - i) * K * bytesOf(DT_F32)); } break; } diff --git a/compute/blas_enhance/src/cpu/arm/int8/blas_matrix_transpose.h b/compute/blas_enhance/src/cpu/arm/int8/blas_matrix_transpose.h index e711b16c..436bac54 100644 --- a/compute/blas_enhance/src/cpu/arm/int8/blas_matrix_transpose.h +++ b/compute/blas_enhance/src/cpu/arm/int8/blas_matrix_transpose.h @@ -14,11 +14,10 @@ #ifndef _H_BLAS_MATRIX_TRANSPOSE #define _H_BLAS_MATRIX_TRANSPOSE -#include #include #include "data_type.h" -#ifndef __aarch64__ +#ifndef _USE_FP16 inline void matrix1_trans_int8(U32 size, U32 blockK, U32 K, INT8 *src, INT8 *dst) { INT8 *src1 = src; @@ -33,7 +32,7 @@ inline void matrix1_trans_int8(U32 size, U32 blockK, U32 K, INT8 *src, INT8 *dst } U32 K4 = pad_to_4_multiple(blockK); for (U32 i = 0; i < K4 - blockK; i++) { - memset(dst, 0, size * sizeof(INT8)); + UNI_MEMSET(dst, 0, size * sizeof(INT8)); dst += size; } } @@ -44,13 +43,13 @@ inline void matrix2_trans_int8(U32 size, U32 blockK, U32 M, INT8 *src, INT8 *dst if (i % 16 == 0) { __builtin_prefetch(src + 16); } - memcpy(dst, src, size * sizeof(INT8)); + UNI_MEMCPY(dst, src, size * sizeof(INT8)); dst += size; src += M; } U32 K4 = pad_to_4_multiple(blockK); for (U32 i = 0; i < K4 - blockK; i++) { - memset(dst, 0, size * sizeof(INT8)); + UNI_MEMSET(dst, 0, size * sizeof(INT8)); dst += size; } } @@ -67,19 +66,19 @@ inline void matrix1_trans_n8(U32 blockK, U32 K, INT8 *src, INT8 *dst) U32 k = 0; for (; k < blockK - 7; k += 8) { if (k % 64 == 0) { - asm volatile("prfm pldl2keep, [%[in0], 64]\n" - "prfm pldl2keep, [%[in1], 64]\n" - "prfm pldl2keep, [%[in2], 64]\n" - "prfm pldl2keep, [%[in3], 64]\n" - "prfm pldl2keep, [%[in4], 64]\n" - "prfm pldl2keep, [%[in5], 64]\n" - "prfm pldl2keep, [%[in6], 64]\n" - "prfm pldl2keep, [%[in7], 64]\n" - : [in0] "+r"(in[0]), [in1] "+r"(in[1]), [in2] "+r"(in[2]), - [in3] "+r"(in[3]), [in4] "+r"(in[4]), [in5] "+r"(in[5]), [in6] "+r"(in[6]), - [in7] "+r"(in[7]) - : - : "memory", "cc"); + asm volatile( + "prfm pldl2keep, [%[in0], 64]\n" + "prfm pldl2keep, [%[in1], 64]\n" + "prfm pldl2keep, [%[in2], 64]\n" + "prfm pldl2keep, [%[in3], 64]\n" + "prfm pldl2keep, [%[in4], 64]\n" + "prfm pldl2keep, [%[in5], 64]\n" + "prfm pldl2keep, [%[in6], 64]\n" + "prfm pldl2keep, [%[in7], 64]\n" + : [in0] "+r"(in[0]), [in1] "+r"(in[1]), [in2] "+r"(in[2]), [in3] "+r"(in[3]), + [in4] "+r"(in[4]), [in5] "+r"(in[5]), [in6] "+r"(in[6]), [in7] 
"+r"(in[7]) + : + : "memory", "cc"); } asm volatile("ldr d0, [%[in0]], 8\n" "ldr d1, [%[in1]], 8\n" @@ -199,27 +198,27 @@ inline void matrix2_trans_m12(U32 blockK, U32 M, INT8 *src, INT8 *dst) } src1 += offset; - asm volatile("ldr d0, [%[in0]]\n" - "ldr d1, [%[in1]]\n" - "ldr d2, [%[in2]]\n" - "ldr d3, [%[in3]]\n" - "zip1 v4.8b, v0.8b, v1.8b\n" - "zip2 v5.8b, v0.8b, v1.8b\n" - "zip1 v6.8b, v2.8b, v3.8b\n" - "zip2 v7.8b, v2.8b, v3.8b\n" + asm volatile( + "ldr d0, [%[in0]]\n" + "ldr d1, [%[in1]]\n" + "ldr d2, [%[in2]]\n" + "ldr d3, [%[in3]]\n" + "zip1 v4.8b, v0.8b, v1.8b\n" + "zip2 v5.8b, v0.8b, v1.8b\n" + "zip1 v6.8b, v2.8b, v3.8b\n" + "zip2 v7.8b, v2.8b, v3.8b\n" - "zip1 v0.4h, v4.4h, v6.4h\n" - "zip2 v1.4h, v4.4h, v6.4h\n" - "zip1 v2.4h, v5.4h, v7.4h\n" - "zip2 v3.4h, v5.4h, v7.4h\n" - "str d0, [%[out]]\n" - "str d1, [%[out], 8]\n" - "str d2, [%[out], 16]\n" - "str d3, [%[out], 24]\n" - : - : [in0] "r"(in12[0]), [in1] "r"(in12[1]), [in2] "r"(in12[2]), - [in3] "r"(in12[3]), [out] "r"(dst1) - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); + "zip1 v0.4h, v4.4h, v6.4h\n" + "zip2 v1.4h, v4.4h, v6.4h\n" + "zip1 v2.4h, v5.4h, v7.4h\n" + "zip2 v3.4h, v5.4h, v7.4h\n" + "str d0, [%[out]]\n" + "str d1, [%[out], 8]\n" + "str d2, [%[out], 16]\n" + "str d3, [%[out], 24]\n" + : + : [in0] "r"(in12[0]), [in1] "r"(in12[1]), [in2] "r"(in12[2]), [in3] "r"(in12[3]), [out] "r"(dst1) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); for (U32 j = 0; j < 4; j++) { for (U32 k = 0; k < 4; k++) { dst1[32 + j * 4 + k] = in12[k][8 + j]; @@ -241,27 +240,27 @@ inline void matrix2_trans_m12(U32 blockK, U32 M, INT8 *src, INT8 *dst) } } - asm volatile("ldr d0, [%[in0]]\n" - "ldr d1, [%[in1]]\n" - "ldr d2, [%[in2]]\n" - "ldr d3, [%[in3]]\n" - "zip1 v4.8b, v0.8b, v1.8b\n" - "zip2 v5.8b, v0.8b, v1.8b\n" - "zip1 v6.8b, v2.8b, v3.8b\n" - "zip2 v7.8b, v2.8b, v3.8b\n" + asm volatile( + "ldr d0, [%[in0]]\n" + "ldr d1, [%[in1]]\n" + "ldr d2, [%[in2]]\n" + "ldr d3, [%[in3]]\n" + "zip1 v4.8b, v0.8b, v1.8b\n" + "zip2 v5.8b, v0.8b, v1.8b\n" + "zip1 v6.8b, v2.8b, v3.8b\n" + "zip2 v7.8b, v2.8b, v3.8b\n" - "zip1 v0.4h, v4.4h, v6.4h\n" - "zip2 v1.4h, v4.4h, v6.4h\n" - "zip1 v2.4h, v5.4h, v7.4h\n" - "zip2 v3.4h, v5.4h, v7.4h\n" - "str d0, [%[out]]\n" - "str d1, [%[out], 8]\n" - "str d2, [%[out], 16]\n" - "str d3, [%[out], 24]\n" - : - : [in0] "r"(in12[0]), [in1] "r"(in12[1]), [in2] "r"(in12[2]), - [in3] "r"(in12[3]), [out] "r"(dst1) - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); + "zip1 v0.4h, v4.4h, v6.4h\n" + "zip2 v1.4h, v4.4h, v6.4h\n" + "zip1 v2.4h, v5.4h, v7.4h\n" + "zip2 v3.4h, v5.4h, v7.4h\n" + "str d0, [%[out]]\n" + "str d1, [%[out], 8]\n" + "str d2, [%[out], 16]\n" + "str d3, [%[out], 24]\n" + : + : [in0] "r"(in12[0]), [in1] "r"(in12[1]), [in2] "r"(in12[2]), [in3] "r"(in12[3]), [out] "r"(dst1) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); for (U32 j = 0; j < 4; j++) { for (U32 k = 0; k < 4; k++) { dst1[32 + j * 4 + k] = in12[k][8 + j]; diff --git a/compute/blas_enhance/src/cpu/arm/int8/mvm.cpp b/compute/blas_enhance/src/cpu/arm/int8/mvm.cpp index 637a0007..a57e532c 100644 --- a/compute/blas_enhance/src/cpu/arm/int8/mvm.cpp +++ b/compute/blas_enhance/src/cpu/arm/int8/mvm.cpp @@ -15,6 +15,7 @@ #include "cpu/arm/blas_arm.h" #include "cpu/arm/int8/blas_matrix_transpose.h" #include "arm_neon_expand.h" +#include "uni.h" #define ALIGN 32 @@ -28,7 +29,7 @@ EE matrix_vector_multiply_transform_weight_int8(TensorDesc desc, INT8 *src, INT8 switch (desc.df) { 
case DF_NORMAL: { CHECK_STATUS(tensor2dGet(desc, &dt, &df, &N, &K)); -#ifdef __aarch64__ +#ifdef _USE_FP16 U32 K4 = pad_to_4_multiple(K); #else U32 K4 = K; @@ -37,13 +38,13 @@ EE matrix_vector_multiply_transform_weight_int8(TensorDesc desc, INT8 *src, INT8 matrix1_trans_int8(ALIGN, K, K, src + i * K, dst + i * K4); } if (i < (int)N) { - memcpy(dst + i * K4, src + i * K, (N - i) * K * bytesOf(DT_I8)); + UNI_MEMCPY(dst + i * K4, src + i * K, (N - i) * K * bytesOf(DT_I8)); } break; } case DF_TRANSPOSE: { CHECK_STATUS(tensor2dGet(desc, &dt, &df, &K, &N)); -#ifdef __aarch64__ +#ifdef _USE_FP16 U32 K4 = pad_to_4_multiple(K); #else U32 K4 = K; @@ -69,7 +70,7 @@ EE matrix_vector_multiply_transform_weight_int8(TensorDesc desc, INT8 *src, INT8 return ret; } -#ifndef __aarch64__ +#ifndef _USE_FP16 #if 1 void mvm_row_pack(U32 Nbatch, U32 K, INT8 *matrix, INT8 *vector, I32 *result) { @@ -197,7 +198,7 @@ void mvm_row_pack(U32 Nbatch, U32 K, INT8 *matrix, INT8 *vector, I32 *result) inline void mvm_row_unpack(U32 Nbatch, U32 K, INT8 *matrix, INT8 *vector, I32 *result) { U32 N = Nbatch * 8; -#ifdef __aarch64__ +#ifdef _USE_FP16 int8x16_t mat[8]; #else int16x4_t mat[8][2]; @@ -213,7 +214,7 @@ inline void mvm_row_unpack(U32 Nbatch, U32 K, INT8 *matrix, INT8 *vector, I32 *r int32x4_t bias0 = vld1q_s32(result + n); int32x4_t bias1 = vld1q_s32(result + n + 4); int32x4_t res[8] = {0}; -#ifdef __aarch64__ +#ifdef _USE_FP16 for (U32 k = 0; k < K_inner; k += 16) { int8x16_t v = vld1q_s8(vector + k); for (int i = 0; i < 8; i++) { @@ -319,7 +320,7 @@ inline void mvm_col(U32 numRows, U32 numColumns, INT8 *matrix, INT8 *vector, I32 U32 NInner = N - NTail; for (U32 n = 0; n < NInner; n += 64) { - memset(tmp, 0, sizeof(I32) * 64); + UNI_MEMSET(tmp, 0, sizeof(I32) * 64); for (U32 k = 0; k < K; k++) { for (U32 i = 0; i < 64; i++) { tmp[i] += vector[k] * matrix[k * N + n + i]; @@ -331,7 +332,7 @@ inline void mvm_col(U32 numRows, U32 numColumns, INT8 *matrix, INT8 *vector, I32 } } - memset(tmp, 0, sizeof(I32) * 64); + UNI_MEMSET(tmp, 0, sizeof(I32) * 64); for (U32 k = 0; k < K; k++) { for (U32 i = 0; i < NTail; i++) { tmp[i] += vector[k] * matrix[k * N + NInner + i]; diff --git a/compute/blas_enhance/src/cpu/arm/int8/v8.2/mmm.cpp b/compute/blas_enhance/src/cpu/arm/int8/v8.2/mmm.cpp new file mode 100644 index 00000000..52b9f1f6 --- /dev/null +++ b/compute/blas_enhance/src/cpu/arm/int8/v8.2/mmm.cpp @@ -0,0 +1,81 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
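The int8 packing in these hunks rounds the reduction dimension up to a multiple of four (K4 = pad_to_4_multiple(K)) and zero-fills the tail, because the int8 kernels consume four int8 products per accumulation step; each packed panel is then addressed with an i * K4 stride, as in the transform routines above and below. A small sketch of the padding arithmetic, assuming pad_to_4_multiple is the usual round-up helper (its real definition lives elsewhere in the library):

// Illustrative only: round K up to the next multiple of 4 for int8 panels.
static inline unsigned int pad_to_4_multiple_sketch(unsigned int k)
{
    return (k + 3) / 4 * 4;  // same pattern as UNI_ALIGN(k, 4)
}
// Packed panel i of an N x K matrix then starts at dst + i * K4, with the
// last (K4 - K) entries of every column zero-filled by the transpose helpers.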
+ +#include "cpu/arm/int8/v8.2/mmm_v8.h" +#include "cpu/arm/int8/blas_int8.h" +#include "cpu/arm/int8/blas_matrix_transpose.h" +#include "cpu/arm/blas_arm.h" + +EE mmm_int8( + int M, int N, int K, bool transposeA, INT8 *matrix1, INT8 *matrix2, INT8 *tmp, I32 *result, Arch arch) +{ + EE ret = SUCCESS; + switch (arch) { + case ARM_A55: + mmm_A55(M, N, K, transposeA, matrix1, matrix2, tmp, result); + break; + case ARM_A76: + mmm_A76(M, N, K, transposeA, matrix1, matrix2, tmp, result); + break; + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE matrix_matrix_multiply_transform_rhsN_int8(TensorDesc desc, INT8 *src, INT8 *dst) +{ + DataType dt; + DataFormat df; + U32 N, K; + CHECK_STATUS(tensor2dGet(desc, &dt, &df, &K, &N)); + U32 K4 = pad_to_4_multiple(K); + int i = 0; + for (; i < (int)N - 11; i += 12) { + matrix2_trans_m12(K, N, src + i, dst + i * K4); + } + for (; i < (int)N - 7; i += 8) { + matrix2_trans_int8(8, K, N, src + i, dst + i * K4); + } + for (; i < (int)N - 3; i += 4) { + matrix2_trans_int8(4, K, N, src + i, dst + i * K4); + } + if ((int)N > i) { + matrix2_trans_int8(N - i, K, N, src + i, dst + i * K4); + } + return SUCCESS; +} + +EE matrix_matrix_multiply_transform_rhsT_int8(TensorDesc desc, INT8 *src, INT8 *dst) +{ + DataType dt; + DataFormat df; + U32 N, K; + CHECK_STATUS(tensor2dGet(desc, &dt, &df, &N, &K)); + U32 K4 = pad_to_4_multiple(K); + int i = 0; + for (; i < (int)N - 11; i += 12) { + matrix1_trans_int8(12, K, K, src + i * K, dst + i * K4); + } + for (; i < (int)N - 7; i += 8) { + matrix1_trans_int8(8, K, K, src + i * K, dst + i * K4); + } + for (; i < (int)N - 3; i += 4) { + matrix1_trans_int8(4, K, K, src + i * K, dst + i * K4); + } + if ((int)N > i) { + matrix1_trans_int8(N - i, K, K, src + i * K, dst + i * K4); + } + return SUCCESS; +} diff --git a/compute/blas_enhance/src/cpu/arm/int8/v8/mmm_A55.cpp b/compute/blas_enhance/src/cpu/arm/int8/v8.2/mmm_A55.cpp similarity index 99% rename from compute/blas_enhance/src/cpu/arm/int8/v8/mmm_A55.cpp rename to compute/blas_enhance/src/cpu/arm/int8/v8.2/mmm_A55.cpp index 4db08cd3..a4086415 100644 --- a/compute/blas_enhance/src/cpu/arm/int8/v8/mmm_A55.cpp +++ b/compute/blas_enhance/src/cpu/arm/int8/v8.2/mmm_A55.cpp @@ -12,8 +12,8 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #include "cpu/arm/blas_arm.h" -#include "cpu/arm/int8/v8/mmm_common.h" -#include "cpu/arm/int8/v8/mmm_v8.h" +#include "cpu/arm/int8/v8.2/mmm_common.h" +#include "cpu/arm/int8/v8.2/mmm_v8.h" #include "cpu/arm/int8/blas_matrix_transpose.h" #include "uni.h" diff --git a/compute/blas_enhance/src/cpu/arm/int8/v8/mmm_A76.cpp b/compute/blas_enhance/src/cpu/arm/int8/v8.2/mmm_A76.cpp similarity index 99% rename from compute/blas_enhance/src/cpu/arm/int8/v8/mmm_A76.cpp rename to compute/blas_enhance/src/cpu/arm/int8/v8.2/mmm_A76.cpp index 0495fd81..0ff95dfe 100644 --- a/compute/blas_enhance/src/cpu/arm/int8/v8/mmm_A76.cpp +++ b/compute/blas_enhance/src/cpu/arm/int8/v8.2/mmm_A76.cpp @@ -12,8 +12,8 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
#include "cpu/arm/blas_arm.h" -#include "cpu/arm/int8/v8/mmm_common.h" -#include "cpu/arm/int8/v8/mmm_v8.h" +#include "cpu/arm/int8/v8.2/mmm_common.h" +#include "cpu/arm/int8/v8.2/mmm_v8.h" #include "cpu/arm/int8/blas_matrix_transpose.h" #include "uni.h" diff --git a/compute/blas_enhance/src/cpu/arm/int8/v8/mmm_common.h b/compute/blas_enhance/src/cpu/arm/int8/v8.2/mmm_common.h similarity index 100% rename from compute/blas_enhance/src/cpu/arm/int8/v8/mmm_common.h rename to compute/blas_enhance/src/cpu/arm/int8/v8.2/mmm_common.h diff --git a/compute/blas_enhance/src/cpu/arm/int8/v8/mmm_v8.h b/compute/blas_enhance/src/cpu/arm/int8/v8.2/mmm_v8.h similarity index 100% rename from compute/blas_enhance/src/cpu/arm/int8/v8/mmm_v8.h rename to compute/blas_enhance/src/cpu/arm/int8/v8.2/mmm_v8.h diff --git a/compute/blas_enhance/src/cpu/arm/int8/v8/mmm.cpp b/compute/blas_enhance/src/cpu/arm/int8/v8/mmm.cpp index 8b1e6640..035d0a30 100644 --- a/compute/blas_enhance/src/cpu/arm/int8/v8/mmm.cpp +++ b/compute/blas_enhance/src/cpu/arm/int8/v8/mmm.cpp @@ -11,29 +11,13 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -#include "cpu/arm/int8/v8/mmm_v8.h" #include "cpu/arm/int8/blas_int8.h" -#include "cpu/arm/int8/blas_matrix_transpose.h" #include "cpu/arm/blas_arm.h" +#include "cpu/arm/int8/blas_matrix_transpose.h" +#include "uni.h" +#include "thread_affinity.h" -EE mmm_int8( - int M, int N, int K, bool transposeA, INT8 *matrix1, INT8 *matrix2, INT8 *tmp, I32 *result, Arch arch) -{ - EE ret = SUCCESS; - switch (arch) { - case ARM_A55: - mmm_A55(M, N, K, transposeA, matrix1, matrix2, tmp, result); - break; - case ARM_A76: - mmm_A76(M, N, K, transposeA, matrix1, matrix2, tmp, result); - break; - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} - +static const int tileN = 8; EE matrix_matrix_multiply_transform_rhsN_int8(TensorDesc desc, INT8 *src, INT8 *dst) { DataType dt; @@ -42,15 +26,9 @@ EE matrix_matrix_multiply_transform_rhsN_int8(TensorDesc desc, INT8 *src, INT8 * CHECK_STATUS(tensor2dGet(desc, &dt, &df, &K, &N)); U32 K4 = pad_to_4_multiple(K); int i = 0; - for (; i < (int)N - 11; i += 12) { - matrix2_trans_m12(K, N, src + i, dst + i * K4); - } for (; i < (int)N - 7; i += 8) { matrix2_trans_int8(8, K, N, src + i, dst + i * K4); } - for (; i < (int)N - 3; i += 4) { - matrix2_trans_int8(4, K, N, src + i, dst + i * K4); - } if ((int)N > i) { matrix2_trans_int8(N - i, K, N, src + i, dst + i * K4); } @@ -65,17 +43,354 @@ EE matrix_matrix_multiply_transform_rhsT_int8(TensorDesc desc, INT8 *src, INT8 * CHECK_STATUS(tensor2dGet(desc, &dt, &df, &N, &K)); U32 K4 = pad_to_4_multiple(K); int i = 0; - for (; i < (int)N - 11; i += 12) { - matrix1_trans_int8(12, K, K, src + i * K, dst + i * K4); - } for (; i < (int)N - 7; i += 8) { matrix1_trans_int8(8, K, K, src + i * K, dst + i * K4); } - for (; i < (int)N - 3; i += 4) { - matrix1_trans_int8(4, K, K, src + i * K, dst + i * K4); - } if ((int)N > i) { matrix1_trans_int8(N - i, K, K, src + i * K, dst + i * K4); } return SUCCESS; } + +void mmm_4x8(U32 offset, U32 K, INT8 *in, INT8 *w, I32 *out) +{ +#if 1 + int32x4_t ret[tileN][2]; + for (int i = 0; i < tileN; i++) { + for (int j = 0; j < 2; j++) { + ret[i][j] = vld1q_s32(out + i * offset + j * 4); + } + } + int16x8_t c[tileN]; + for (U32 n = 0; n < K; n += 4) { + int8x8_t b0 = vld1_s8(w); + w += 8; + for (int i = 0; i < 
tileN; i++) { + int8x8_t a0 = vdup_n_s8(in[0]); + c[i] = vmull_s8(a0, b0); + in++; + } + for (U32 j = 0; j < 3; j++) { + int8x8_t b0 = vld1_s8(w); + w += 8; + for (int i = 0; i < tileN; i++) { + int8x8_t a0 = vdup_n_s8(in[0]); + c[i] = vmlal_s8(c[i], a0, b0); + in++; + } + } + for (int i = 0; i < tileN; i++) { + ret[i][0] = vaddw_s16(ret[i][0], vget_low_s16(c[i])); + ret[i][1] = vaddw_s16(ret[i][1], vget_high_s16(c[i])); + } + } + for (int i = 0; i < tileN; i++) { + for (int j = 0; j < 2; j++) { + vst1q_s32(out + i * offset + j * 4, ret[i][j]); + } + } +#else + offset *= 4; + asm volatile("mov x3, %0\n" + "ld1r {v0.8b}, [x3]\n" + "ld1r {v1.8b}, [x3]!\n" + "ld1r {v2.8b}, [x3]!\n" + "ld1r {v3.8b}, [x3]!\n" + //"ld1r {v4.8b}, [x3]!\n" + //"ld1r {v5.8b}, [x3]!\n" + + "mov x0, %1\n" + "ldp d6, d7, [x0]!\n" + + // give out address to x26 + "mov x26, %2\n" + + // load in bias + "ldp q8, q9, [x26]\n" + "add x26, x26, %4\n" + "ldp q10, q11, [x26]\n" + "add x26, x26, %4\n" + "ldp q12, q13, [x26]\n" + "add x26, x26, %4\n" + "ldp q14, q15, [x26]\n" + "add x26, x26, %4\n" + "ldp q24, q25, [x26]\n" + "add x26, x26, %4\n" + "ldp q26, q27, [x26]\n" + "add x26, x26, %4\n" + "ldp q28, q29, [x26]\n" + "add x26, x26, %4\n" + "ldp q30, q31, [x26]\n" + + // K- > x26 + "mov x26, %3\n" + + // Computation loop + "0:\n" + + "smull v16.8h, v0.8b, v6.8b\n" + "ld1r {v0.8b}, [x3]!\n" + "smull v17.8h, v1.8b, v6.8b\n" + "ld1r {v1.8b}, [x3]!\n" + "smull v18.8h, v2.8b, v6.8b\n" + "ld1r {v2.8b}, [x3]!\n" + "smull v19.8h, v3.8b, v6.8b\n" + "ld1r {v3.8b}, [x3]!\n" + "smull v20.8h, v0.8b, v6.8b\n" + "ld1r {v0.8b}, [x3]!\n" + "smull v21.8h, v1.8b, v6.8b\n" + "ld1r {v1.8b}, [x3]!\n" + "smull v22.8h, v2.8b, v6.8b\n" + "ld1r {v2.8b}, [x3]!\n" + "smull v23.8h, v3.8b, v6.8b\n" + "ld1r {v3.8b}, [x3]!\n" + "ldr d6, [x0]!\n" + + "smlal v16.8h, v0.8b, v7.8b\n" + "ld1r {v0.8b}, [x3]!\n" + "smlal v17.8h, v1.8b, v7.8b\n" + "ld1r {v1.8b}, [x3]!\n" + "smlal v18.8h, v2.8b, v7.8b\n" + "ld1r {v2.8b}, [x3]!\n" + "smlal v19.8h, v3.8b, v7.8b\n" + "ld1r {v3.8b}, [x3]!\n" + "smlal v20.8h, v0.8b, v7.8b\n" + "ld1r {v0.8b}, [x3]!\n" + "smlal v21.8h, v1.8b, v7.8b\n" + "ld1r {v1.8b}, [x3]!\n" + "smlal v22.8h, v2.8b, v7.8b\n" + "ld1r {v2.8b}, [x3]!\n" + "smlal v23.8h, v3.8b, v7.8b\n" + "ld1r {v3.8b}, [x3]!\n" + "ldr d7, [x0]!\n" + + "smlal v16.8h, v0.8b, v6.8b\n" + "ld1r {v0.8b}, [x3]!\n" + "smlal v17.8h, v1.8b, v6.8b\n" + "ld1r {v1.8b}, [x3]!\n" + "smlal v18.8h, v2.8b, v6.8b\n" + "ld1r {v2.8b}, [x3]!\n" + "smlal v19.8h, v3.8b, v6.8b\n" + "ld1r {v3.8b}, [x3]!\n" + "smlal v20.8h, v4.8b, v6.8b\n" + "ld1r {v0.8b}, [x3]!\n" + "smlal v21.8h, v1.8b, v6.8b\n" + "ld1r {v1.8b}, [x3]!\n" + "smlal v22.8h, v2.8b, v6.8b\n" + "ld1r {v2.8b}, [x3]!\n" + "smlal v23.8h, v3.8b, v6.8b\n" + "ld1r {v3.8b}, [x3]!\n" + "ldr d6, [x0]!\n" + + "smlal v16.8h, v0.8b, v7.8b\n" + "ld1r {v0.8b}, [x3]!\n" + "smlal v17.8h, v1.8b, v7.8b\n" + "ld1r {v1.8b}, [x3]!\n" + "smlal v18.8h, v2.8b, v7.8b\n" + "ld1r {v2.8b}, [x3]!\n" + "smlal v19.8h, v3.8b, v7.8b\n" + "ld1r {v3.8b}, [x3]!\n" + "smlal v20.8h, v0.8b, v7.8b\n" + "ld1r {v0.8b}, [x3]!\n" + "smlal v21.8h, v1.8b, v7.8b\n" + "ld1r {v1.8b}, [x3]!\n" + "smlal v22.8h, v2.8b, v7.8b\n" + "ld1r {v2.8b}, [x3]!\n" + "smlal v23.8h, v3.8b, v7.8b\n" + "ld1r {v3.8b}, [x3]!\n" + "ldr d7, [x0]!\n" + + "subs x26, x26, #4\n" + + "saddw v8.4s, v8.4s, v16.4h\n" + "saddw2 v9.4s, v9.4s, v16.8h\n" + "saddw v10.4s, v10.4s, v17.4h\n" + "saddw2 v11.4s, v11.4s, v17.8h\n" + "saddw v12.4s, v12.4s, v18.4h\n" + "saddw2 v13.4s, v13.4s, v18.8h\n" + "saddw v14.4s, v14.4s, 
v19.4h\n" + "saddw2 v15.4s, v15.4s, v19.8h\n" + "saddw v24.4s, v24.4s, v20.4h\n" + "saddw2 v25.4s, v25.4s, v20.8h\n" + "saddw v26.4s, v26.4s, v21.4h\n" + "saddw2 v27.4s, v27.4s, v21.8h\n" + "saddw v28.4s, v28.4s, v22.4h\n" + "saddw2 v29.4s, v29.4s, v22.8h\n" + "saddw v30.4s, v30.4s, v23.4h\n" + "saddw2 v31.4s, v31.4s, v23.8h\n" + + "bne 0b\n" + + // give out address to x26 + "mov x26, %2\n" + + "stp q8, q9, [x26]\n" + "add x26, x26, %4\n" + "stp q10, q11, [x26]\n" + "add x26, x26, %4\n" + "stp q12, q13, [x26]\n" + "add x26, x26, %4\n" + "stp q14, q15, [x26]\n" + "add x26, x26, %4\n" + "stp q24, q25, [x26]\n" + "add x26, x26, %4\n" + "stp q26, q27, [x26]\n" + "add x26, x26, %4\n" + "stp q28, q29, [x26]\n" + "add x26, x26, %4\n" + "stp q30, q31, [x26]\n" + : "+r"(in), "+r"(w), "+r"(out) + : "r"((I64)K), "r"((I64)offset) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", + "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", + "v30", "v31", "x26"); +#endif +} + +inline void mmm_NTail_M8(U32 M, U32 N, U32 K, INT8 *matrix1, INT8 *matrix2, I32 *result) +{ + for (U32 i = 0; i < N; i++) { + int32x4_t res1 = vld1q_s32(result + i * M); + int32x4_t res2 = vld1q_s32(result + i * M + 4); + for (U32 q = 0; q < K; q += 1) { + int8x8_t mat2 = vld1_s8(matrix2 + q * 8); + int8x8_t mat1 = vdup_n_s8(matrix1[q * N + i]); + int16x8_t r = vmull_s8(mat1, mat2); + res1 = vaddw_s16(res1, vget_low_s16(r)); + res2 = vaddw_s16(res2, vget_high_s16(r)); + } + vst1q_s32(result + i * M, res1); + vst1q_s32(result + i * M + 4, res2); + } +} + +inline void mmm_NTail_M(U32 MInner, U32 M, U32 N, U32 K, INT8 *matrix1, INT8 *matrix2, I32 *result) +{ + for (U32 i = 0; i < N; i++) { + for (U32 j = 0; j < MInner; j++) { + for (U32 k = 0; k < K; k++) { + result[i * M + j] += ((I32)matrix1[k * N + i]) * matrix2[k * MInner + j]; + } + } + } +} + +inline void mmm_N4_MTail(U32 MInner, U32 M, U32 K, INT8 *matrix1, INT8 *matrix2, I32 *result) +{ + const int unroll = 4; + int32x4_t res[tileN][2] = {0}; + for (U32 k = 0; k < K; k += unroll) { + int16x8_t res_s16[tileN] = {0}; + for (U32 kk = 0; kk < unroll; kk++) { + U32 z = k + kk; + int8x8_t mat2 = vld1_s8(matrix2 + z * MInner); + for (int i = 0; i < tileN; i++) { + int8x8_t mat10 = vdup_n_s8(matrix1[z * tileN + i]); + res_s16[i] = vmlal_s8(res_s16[i], mat10, mat2); + } + } + for (int i = 0; i < tileN; i++) { + res[i][0] = vaddw_s16(res[i][0], vget_low_s16(res_s16[i])); + res[i][1] = vaddw_s16(res[i][1], vget_high_s16(res_s16[i])); + } + } + int tmp[8]; + for (int i = 0; i < tileN; i++) { + vst1q_s32(tmp, res[i][0]); + vst1q_s32(tmp + 4, res[i][1]); + for (U32 p = 0; p < MInner; p++) { + result[i * M + p] += tmp[p]; + } + } +} + +EE mmm_int8( + int M, int N, int K, bool transposeA, INT8 *matrix1, INT8 *matrix2, INT8 *tmp, I32 *result, Arch arch) +{ + int blockK = K; + U32 K4 = pad_to_4_multiple(K); + int blockM = 96; + for (int k = 0; k < K; k += blockK) { + int KInner = UNI_MIN(blockK, K - k); +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (int n = 0; n <= N - tileN; n += tileN) { + INT8 *matrix1Trans = tmp + n * K4; + if (transposeA) { + matrix2_trans_int8(tileN, KInner, N, matrix1 + n, matrix1Trans); + } else { + matrix1_trans_int8(tileN, KInner, K, matrix1 + n * K + k, matrix1Trans); + } + } + int n = N / tileN * tileN; + if (N - n > 0) { + INT8 *matrix1Trans = tmp + n * K4; + if (transposeA) { + matrix2_trans_int8(N - n, 
KInner, N, matrix1 + n, matrix1Trans); + } else { + matrix1_trans_int8(N - n, KInner, K, matrix1 + n * K + k, matrix1Trans); + } + } + +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (int i = 0; i < M; i += blockM) { + int MInner = UNI_MIN(blockM, M - i); + I32 *resultCurrent; + int m, n; + for (n = 0; n <= N - tileN; n += tileN) { + INT8 *matrix1Trans = tmp + n * K4; + //if (i == 0) { + // if (transposeA) { + // matrix2_trans_int8(4, KInner, N, matrix1 + n, matrix1Trans + n * K4); + // } else { + // matrix1_trans_int8(4, KInner, K, matrix1 + n * K + k, matrix1Trans + n * K4); + // } + //} + for (m = 0; m <= (MInner - 8); m += 8) { + resultCurrent = result + n * M + m + i; + mmm_4x8(M, K4, matrix1Trans, matrix2 + (i + m) * K4, resultCurrent); + //mmm_NTail_M(8, M, 4, K4, matrix1Trans, matrix2 + (i + m) * K4, resultCurrent); + } + + if (MInner - m) { + resultCurrent = result + n * M + m + i; + mmm_N4_MTail( + MInner - m, M, K4, matrix1Trans, matrix2 + (i + m) * K4, resultCurrent); + //mmm_NTail_M(MInner - m, M, 4, K4, matrix1Trans, matrix2 + (i + m) * K4, + // resultCurrent); + } + } + + if (N - n) { + INT8 *matrix1Trans = tmp + n * K4; + //if (i == 0) { + // if (transposeA) { + // matrix2_trans_int8(N - n, KInner, N, matrix1 + n, matrix1Trans + n * K4); + // } else { + // matrix1_trans_int8( + // N - n, KInner, K, matrix1 + n * K + k, matrix1Trans + n * K4); + // } + //} + + for (m = 0; m <= (MInner - 8); m += 8) { + resultCurrent = result + n * M + m + i; + mmm_NTail_M8( + M, N - n, KInner, matrix1Trans, matrix2 + (i + m) * K4, resultCurrent); + //mmm_NTail_M(8, M, N - n, K4, matrix1Trans, + // matrix2 + (i + m) * K4, resultCurrent); + } + + if (MInner - m) { + resultCurrent = result + n * M + m + i; + mmm_NTail_M(MInner - m, M, N - n, K4, matrix1Trans, matrix2 + (i + m) * K4, + resultCurrent); + } + } + } + } + return SUCCESS; +} diff --git a/compute/blas_enhance/src/cpu/x86/axpby.cpp b/compute/blas_enhance/src/cpu/x86/axpby.cpp new file mode 100644 index 00000000..b105d41a --- /dev/null +++ b/compute/blas_enhance/src/cpu/x86/axpby.cpp @@ -0,0 +1,32 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#include "cpu/x86/blas_x86.h" +#ifdef _USE_FP32 +#include "cpu/x86/fp32/blas_fp32.h" +#endif + +EE axpby_x86(U32 len, DataType dt, F32 a, const void *x, F32 b, void *y) +{ + EE ret = NOT_SUPPORTED; + switch (dt) { +#ifdef _USE_FP32 + case DT_F32: + ret = axpby_fp32(len, a, (F32 *)x, b, (F32 *)y); + break; +#endif + default: + break; + } + return ret; +} diff --git a/compute/blas_enhance/src/cpu/x86/blas_x86.h b/compute/blas_enhance/src/cpu/x86/blas_x86.h index 997b21fe..c667c99b 100644 --- a/compute/blas_enhance/src/cpu/x86/blas_x86.h +++ b/compute/blas_enhance/src/cpu/x86/blas_x86.h @@ -15,9 +15,10 @@ #define _H_BLAS_X86 #include "error.h" -#include "sys.h" #include "tensor_desc.h" +EE axpby_x86(U32 len, DataType dt, F32 a, const void *x, F32 b, void *y); + EE matrix_vector_multiply_transform_weight_x86( TensorDesc desc, const void *src, TensorDesc *descTran, void *dst, void *offsetCBias); @@ -34,10 +35,10 @@ EE mvm_x86(U32 row, const F32 *scale); EE matrix_matrix_multiply_tmp_bytes_x86( - U32 matrixA_M, U32 matrixA_K, U32 matrixB_K, U32 matrixB_N, DataType dt, U32 *bytes); + U32 matrixA_M, U32 matrixA_K, U32 matrixB_K, U32 matrixB_N, DataFormat df, DataType dt, U32 *bytes); EE matrix_matrix_multiply_transform_rhs_x86( - TensorDesc desc, const void *src, TensorDesc *descTran, void *dst, void *offsetCBias); + TensorDesc desc, const void *src, TensorDesc *descTran, void *dst); EE mmm_x86(U32 matrixC_N, U32 matrixC_M, diff --git a/compute/blas_enhance/src/cpu/x86/fp32/axpby.cpp b/compute/blas_enhance/src/cpu/x86/fp32/axpby.cpp new file mode 100644 index 00000000..c96b33aa --- /dev/null +++ b/compute/blas_enhance/src/cpu/x86/fp32/axpby.cpp @@ -0,0 +1,32 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#include "cpu/x86/fp32/blas_fp32.h" + +EE axpby_fp32(U32 len, F32 a, const F32 *x, F32 b, F32 *y) +{ + __m256 alpha = _mm256_set1_ps(a); + __m256 beta = _mm256_set1_ps(b); + I32 i = 0; + for (; i < ((I32)len) - 7; i += 8) { + __m256 in = _mm256_loadu_ps(x + i); + __m256 out = _mm256_loadu_ps(y + i); + out = _mm256_mul_ps(out, beta); + out = _mm256_fmadd_ps(alpha, in, out); + _mm256_storeu_ps(y + i, out); + } + for (; i < (I32)len; i++) { + y[i] = a * x[i] + b * y[i]; + } + return SUCCESS; +} diff --git a/compute/blas_enhance/src/cpu/x86/fp32/blas_fp32.h b/compute/blas_enhance/src/cpu/x86/fp32/blas_fp32.h index 02d9ab5e..2cdaece0 100644 --- a/compute/blas_enhance/src/cpu/x86/fp32/blas_fp32.h +++ b/compute/blas_enhance/src/cpu/x86/fp32/blas_fp32.h @@ -14,13 +14,12 @@ #ifndef _H_BLAS_FP32 #define _H_BLAS_FP32 -#include "sys.h" - -#include "error.h" #include "tensor_desc.h" #include "thread_affinity.h" #include "uni.h" +EE axpby_fp32(U32 len, F32 a, const F32 *x, F32 b, F32 *y); + void mvm_col_fp32(U32 row, U32 col, F32 *matrix, F32 *vector, F32 *result); void mvm_row_fp32(U32 row, U32 col, F32 *matrix, F32 *vector, F32 *result); @@ -68,14 +67,15 @@ EE mmm_avx2_fp32(int M, F32 *tmp, F32 *result); -inline void matrix1_trans(U32 size, U32 blockK, U32 K, F32 *src, F32 *dst) +inline void matrix1_trans_w(U32 size, U32 realSize, U32 blockK, U32 K, F32 *src, F32 *dst) { - U32 remain = size % 4; - size = size / 4 * 4; + U32 remain = realSize % 4; + U32 mainSize = realSize / 4 * 4; __m128i vindex = _mm_set_epi32(K * 3, K * 2, K, 0); + F32 *rdst = dst; for (U32 i = 0; i < blockK; ++i) { U32 j; - for (j = 0; j < size; j += 4) { + for (j = 0; j < mainSize; j += 4) { if (i % 16 == 0) { _mm_prefetch(src + i + j * K + 16, _MM_HINT_NTA); _mm_prefetch(src + i + (j + 1) * K + 16, _MM_HINT_NTA); @@ -85,7 +85,49 @@ inline void matrix1_trans(U32 size, U32 blockK, U32 K, F32 *src, F32 *dst) _mm_store_ps(dst, _mm_i32gather_ps(src + i + j * K, vindex, 4)); dst += 4; } - for (; j < remain; ++j) { + for (; j < realSize; ++j) { + if (i % 16 == 0) { + _mm_prefetch(src + i + (j + mainSize) * K + 16, _MM_HINT_NTA); + } + *(dst++) = *(src + i + j * K); + } + + for (; j < size; ++j) { + *(dst++) = 0; + } + } +} + +inline void matrix2_trans_w(U32 size, U32 realSize, U32 blockK, U32 M, F32 *src, F32 *dst) +{ + for (U32 i = 0; i < blockK; i++) { + for (U32 j = 0; j < size; j += 16) { + _mm_prefetch(src + M + j, _MM_HINT_NTA); + } + UNI_MEMCPY(dst, src, realSize * sizeof(F32)); + dst += size; + src += M; + } +} + +inline void matrix1_trans(U32 size, U32 blockK, U32 K, F32 *src, F32 *dst) +{ + U32 remain = size % 8; + size = size / 8 * 8; + __m256i vindex = _mm256_set_epi32(K * 7, K * 6, K * 5, K * 4, K * 3, K * 2, K, 0); + for (U32 i = 0; i < blockK; ++i) { + U32 j; + for (j = 0; j < size; j += 8) { + if (i % 16 == 0) { + _mm_prefetch(src + i + j * K + 16, _MM_HINT_NTA); + _mm_prefetch(src + i + (j + 1) * K + 16, _MM_HINT_NTA); + _mm_prefetch(src + i + (j + 2) * K + 16, _MM_HINT_NTA); + _mm_prefetch(src + i + (j + 3) * K + 16, _MM_HINT_NTA); + } + _mm256_storeu_ps(dst, _mm256_i32gather_ps(src + i + j * K, vindex, 4)); + dst += 8; + } + for (; j < (remain + size); ++j) { if (i % 16 == 0) { _mm_prefetch(src + i + (j + size) * K + 16, _MM_HINT_NTA); } @@ -100,7 +142,7 @@ inline void matrix2_trans(U32 size, U32 blockK, U32 M, F32 *src, F32 *dst) for (U32 j = 0; j < size; j += 16) { _mm_prefetch(src + M + j, _MM_HINT_NTA); } - memcpy(dst, src, size * sizeof(F32)); + UNI_MEMCPY(dst, src, size * sizeof(F32)); dst += size; src += M; 
} diff --git a/compute/blas_enhance/src/cpu/x86/fp32/mmm_avx2.cpp b/compute/blas_enhance/src/cpu/x86/fp32/mmm_avx2.cpp index d1758191..6ad2c4e0 100644 --- a/compute/blas_enhance/src/cpu/x86/fp32/mmm_avx2.cpp +++ b/compute/blas_enhance/src/cpu/x86/fp32/mmm_avx2.cpp @@ -17,17 +17,576 @@ #define UNROLL_K 4 #define UNROLL_N 24 #define UNROLL_M 4 -#define BOLCK_M_DIM 768 -#define BOLCK_K_DIM 768 +#define BOLCK_M_DIM 1024 +#define BOLCK_K_DIM 1024 #define align_addr(addr, unit) (((uintptr_t)addr + unit - 1) / unit * unit) -typedef void (*kernel_func)( - U32 um, U32 un, U32 bk, F32 *matrixA, F32 *matrixB, F32 *matrixC, U32 ldc); +typedef void (*kernel_func)(U32 um, + U32 un, + U32 bk, + F32 *matrixA, + F32 *matrixB, + F32 *matrixC, + U32 ldc, + I32 *mask, + F32 *A1, + F32 *A2, + F32 *A3); + +// clang-format off +#define clear1Regs(rtype) \ + "vxorps "#rtype"0, "#rtype"0, "#rtype"0 \n\t" + +#define clear2Regs(rtype) \ + clear1Regs(rtype) \ + "vxorps "#rtype"1, "#rtype"1, "#rtype"1 \n\t" + +#define clear3Regs(rtype) \ + clear2Regs(rtype) \ + "vxorps "#rtype"2, "#rtype"2, "#rtype"2 \n\t" + +#define clear4Regs(rtype) \ + clear3Regs(rtype) \ + "vxorps "#rtype"3, "#rtype"3, "#rtype"3 \n\t" + +#define clear6Regs(rtype) \ + clear4Regs(rtype) \ + "vxorps "#rtype"4, "#rtype"4, "#rtype"4 \n\t" \ + "vxorps "#rtype"5, "#rtype"5, "#rtype"5 \n\t" + +#define clear8Regs(rtype) \ + clear6Regs(rtype) \ + "vxorps "#rtype"6, "#rtype"6, "#rtype"6 \n\t" \ + "vxorps "#rtype"7, "#rtype"7, "#rtype"7 \n\t" + +#define clear9Regs(rtype) \ + clear8Regs(rtype) \ + "vxorps "#rtype"8, "#rtype"8, "#rtype"8 \n\t" + +#define clear12Regs(rtype) \ + clear9Regs(rtype) \ + "vxorps "#rtype"9, "#rtype"9, "#rtype"9 \n\t" \ + "vxorps "#rtype"10, "#rtype"10, "#rtype"10 \n\t" \ + "vxorps "#rtype"11, "#rtype"11, "#rtype"11 \n\t" + +#define asm_1x24_kernel(i0, f0, f1, f2) \ + "vbroadcastss "#i0"(%[A0]), %%ymm15 \n\t" \ + "vmovaps "#f0"(%[B]), %%ymm12 \n\t" \ + "vmovaps "#f1"(%[B]), %%ymm13 \n\t" \ + "vmovaps "#f2"(%[B]), %%ymm14 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + +#define asm_2x24_kernel(i0, f0, f1, f2) \ + asm_1x24_kernel(i0, f0, f1, f2) \ + "vbroadcastss "#i0"(%[A1]), %%ymm15 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + +#define asm_3x24_kernel(i0, f0, f1, f2) \ + asm_2x24_kernel(i0, f0, f1, f2) \ + "vbroadcastss "#i0"(%[A2]), %%ymm15 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + +#define asm_4x24_kernel(i0, f0, f1, f2) \ + asm_3x24_kernel(i0, f0, f1, f2) \ + "vbroadcastss "#i0"(%[A3]), %%ymm15 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + +#define store_1x24_0(N) \ + "vaddps (%[C]), %%ymm0, %%ymm0 \n\t" \ + "vaddps 0x20(%[C]), %%ymm1, %%ymm1 \n\t" \ + "vaddps 0x40(%[C]), %%ymm2, %%ymm2 \n\t" \ + "vmovups %%ymm0, (%[C]) \n\t" \ + "vmovups %%ymm1, 0x20(%[C]) \n\t" \ + "vmovups %%ymm2, 0x40(%[C]) \n\t" + +#define store_2x24_0(N) \ + store_1x24_0(N) \ + "add "#N", %[C] \n\t" \ + "vaddps (%[C]), %%ymm3, %%ymm3 \n\t" \ + "vaddps 0x20(%[C]), %%ymm4, %%ymm4 \n\t" \ + "vaddps 0x40(%[C]), %%ymm5, %%ymm5 \n\t" \ + "vmovups %%ymm3, (%[C]) \n\t" \ + "vmovups %%ymm4, 0x20(%[C]) \n\t" \ + "vmovups %%ymm5, 0x40(%[C]) \n\t" + 
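Each asm_*x24_kernel step above broadcasts one element of A, loads three aligned 8-float strips of packed B, and accumulates into the per-row ymm registers with vfmadd231ps; the store_* macros then add those accumulators into C, the *_1 variants using vmaskmovps for a partial final strip. The same single accumulation step written with AVX2 intrinsics, purely to illustrate what the assembly encodes (function and variable names are made up):

#include <immintrin.h>

// One k-step of a 1x24 tile: c[0..2] += a0[0] * b[0..23].
static inline void tile_1x24_step(const float *a0, const float *b, __m256 c[3])
{
    __m256 av = _mm256_broadcast_ss(a0);                     // vbroadcastss
    c[0] = _mm256_fmadd_ps(av, _mm256_load_ps(b), c[0]);     // vfmadd231ps
    c[1] = _mm256_fmadd_ps(av, _mm256_load_ps(b + 8), c[1]);
    c[2] = _mm256_fmadd_ps(av, _mm256_load_ps(b + 16), c[2]);
}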
+#define store_3x24_0(N) \ + store_2x24_0(N) \ + "add "#N", %[C] \n\t" \ + "vaddps (%[C]), %%ymm6, %%ymm6 \n\t" \ + "vaddps 0x20(%[C]), %%ymm7, %%ymm7 \n\t" \ + "vaddps 0x40(%[C]), %%ymm8, %%ymm8 \n\t" \ + "vmovups %%ymm6, (%[C]) \n\t" \ + "vmovups %%ymm7, 0x20(%[C]) \n\t" \ + "vmovups %%ymm8, 0x40(%[C]) \n\t" + +#define store_4x24_0(N) \ + store_3x24_0(N) \ + "add "#N", %[C] \n\t" \ + "vaddps (%[C]), %%ymm9, %%ymm9 \n\t" \ + "vaddps 0x20(%[C]), %%ymm10, %%ymm10 \n\t" \ + "vaddps 0x40(%[C]), %%ymm11, %%ymm11 \n\t" \ + "vmovups %%ymm9, (%[C]) \n\t" \ + "vmovups %%ymm10, 0x20(%[C]) \n\t" \ + "vmovups %%ymm11, 0x40(%[C]) \n\t" + +#define store_1x24_1(N) \ + "vmovups (%[mask]), %%ymm15 \n\t" \ + "vmaskmovps 0x40(%[C]), %%ymm15, %%ymm14 \n\t" \ + "vaddps (%[C]), %%ymm0, %%ymm0 \n\t" \ + "vaddps 0x20(%[C]), %%ymm1, %%ymm1 \n\t" \ + "vaddps %%ymm14, %%ymm2, %%ymm2 \n\t" \ + "vmovups %%ymm0, (%[C]) \n\t" \ + "vmovups %%ymm1, 0x20(%[C]) \n\t" \ + "vmaskmovps %%ymm2, %%ymm15, 0x40(%[C]) \n\t" + +#define store_2x24_1(N) \ + store_1x24_1(N) \ + "add "#N", %[C] \n\t" \ + "vmaskmovps 0x40(%[C]), %%ymm15, %%ymm14 \n\t" \ + "vaddps (%[C]), %%ymm3, %%ymm3 \n\t" \ + "vaddps 0x20(%[C]), %%ymm4, %%ymm4 \n\t" \ + "vaddps %%ymm14, %%ymm5, %%ymm5 \n\t" \ + "vmovups %%ymm3, (%[C]) \n\t" \ + "vmovups %%ymm4, 0x20(%[C]) \n\t" \ + "vmaskmovps %%ymm5, %%ymm15, 0x40(%[C]) \n\t" + +#define store_3x24_1(N) \ + store_2x24_1(N) \ + "add "#N", %[C] \n\t" \ + "vmaskmovps 0x40(%[C]), %%ymm15, %%ymm14 \n\t" \ + "vaddps (%[C]), %%ymm6, %%ymm6 \n\t" \ + "vaddps 0x20(%[C]), %%ymm7, %%ymm7 \n\t" \ + "vaddps %%ymm14, %%ymm8, %%ymm8 \n\t" \ + "vmovups %%ymm6, (%[C]) \n\t" \ + "vmovups %%ymm7, 0x20(%[C]) \n\t" \ + "vmaskmovps %%ymm8, %%ymm15, 0x40(%[C]) \n\t" + +#define store_4x24_1(N) \ + store_3x24_1(N) \ + "add "#N", %[C] \n\t" \ + "vmaskmovps 0x40(%[C]), %%ymm15, %%ymm14 \n\t" \ + "vaddps (%[C]), %%ymm9, %%ymm9 \n\t" \ + "vaddps 0x20(%[C]), %%ymm10, %%ymm10 \n\t" \ + "vaddps %%ymm14, %%ymm11, %%ymm11 \n\t" \ + "vmovups %%ymm9, (%[C]) \n\t" \ + "vmovups %%ymm10, 0x20(%[C]) \n\t" \ + "vmaskmovps %%ymm11, %%ymm15, 0x40(%[C]) \n\t" + + +#define asm_1x16_kernel(i0, f0, f1) \ + "vbroadcastss "#i0"(%[A0]), %%ymm10 \n\t" \ + "vmovaps "#f0"(%[B]), %%ymm8 \n\t" \ + "vmovaps "#f1"(%[B]), %%ymm9 \n\t" \ + "vfmadd231ps %%ymm10, %%ymm8, %%ymm0 \n\t" \ + "vfmadd231ps %%ymm10, %%ymm9, %%ymm1 \n\t" \ + +#define asm_2x16_kernel(i0, f0, f1) \ + asm_1x16_kernel(i0, f0, f1) \ + "vbroadcastss "#i0"(%[A1]), %%ymm10 \n\t" \ + "vfmadd231ps %%ymm10, %%ymm8, %%ymm2 \n\t" \ + "vfmadd231ps %%ymm10, %%ymm9, %%ymm3 \n\t" \ + +#define asm_3x16_kernel(i0, f0, f1) \ + asm_2x16_kernel(i0, f0, f1) \ + "vbroadcastss "#i0"(%[A2]), %%ymm10 \n\t" \ + "vfmadd231ps %%ymm10, %%ymm8, %%ymm4 \n\t" \ + "vfmadd231ps %%ymm10, %%ymm9, %%ymm5 \n\t" \ + +#define asm_4x16_kernel(i0, f0, f1) \ + asm_3x16_kernel(i0, f0, f1) \ + "vbroadcastss "#i0"(%[A3]), %%ymm10 \n\t" \ + "vfmadd231ps %%ymm10, %%ymm8, %%ymm6 \n\t" \ + "vfmadd231ps %%ymm10, %%ymm9, %%ymm7 \n\t" \ + +#define store_1x16_0(N) \ + "vaddps (%[C]), %%ymm0, %%ymm0 \n\t" \ + "vaddps 0x20(%[C]), %%ymm1, %%ymm1 \n\t" \ + "vmovups %%ymm0, (%[C]) \n\t" \ + "vmovups %%ymm1, 0x20(%[C]) \n\t" \ + +#define store_2x16_0(N) \ + store_1x16_0(N) \ + "add "#N", %[C] \n\t" \ + "vaddps (%[C]), %%ymm2, %%ymm2 \n\t" \ + "vaddps 0x20(%[C]), %%ymm3, %%ymm3 \n\t" \ + "vmovups %%ymm2, (%[C]) \n\t" \ + "vmovups %%ymm3, 0x20(%[C]) \n\t" \ + +#define store_3x16_0(N) \ + store_2x16_0(N) \ + "add "#N", %[C] \n\t" \ + "vaddps (%[C]), %%ymm4, %%ymm4 
\n\t" \ + "vaddps 0x20(%[C]), %%ymm5, %%ymm5 \n\t" \ + "vmovups %%ymm4, (%[C]) \n\t" \ + "vmovups %%ymm5, 0x20(%[C]) \n\t" \ + +#define store_4x16_0(N) \ + store_3x16_0(N) \ + "add "#N", %[C] \n\t" \ + "vaddps (%[C]), %%ymm6, %%ymm6 \n\t" \ + "vaddps 0x20(%[C]), %%ymm7, %%ymm7 \n\t" \ + "vmovups %%ymm6, (%[C]) \n\t" \ + "vmovups %%ymm7, 0x20(%[C]) \n\t" \ + +#define store_1x16_1(N) \ + "vmovups (%[mask]), %%ymm10 \n\t" \ + "vmaskmovps 0x20(%[C]), %%ymm10, %%ymm9 \n\t" \ + "vaddps (%[C]), %%ymm0, %%ymm0 \n\t" \ + "vaddps %%ymm9, %%ymm1, %%ymm1 \n\t" \ + "vmovups %%ymm0, (%[C]) \n\t" \ + "vmaskmovps %%ymm1, %%ymm10, 0x20(%[C]) \n\t" \ + +#define store_2x16_1(N) \ + store_1x16_1(N) \ + "add "#N", %[C] \n\t" \ + "vmaskmovps 0x20(%[C]), %%ymm10, %%ymm9 \n\t" \ + "vaddps (%[C]), %%ymm2, %%ymm2 \n\t" \ + "vaddps %%ymm9, %%ymm3, %%ymm3 \n\t" \ + "vmovups %%ymm2, (%[C]) \n\t" \ + "vmaskmovps %%ymm3, %%ymm10, 0x20(%[C]) \n\t" \ + +#define store_3x16_1(N) \ + store_2x16_1(N) \ + "add "#N", %[C] \n\t" \ + "vmaskmovps 0x20(%[C]), %%ymm10, %%ymm9 \n\t" \ + "vaddps (%[C]), %%ymm4, %%ymm4 \n\t" \ + "vaddps %%ymm9, %%ymm5, %%ymm5 \n\t" \ + "vmovups %%ymm4, (%[C]) \n\t" \ + "vmaskmovps %%ymm5, %%ymm10, 0x20(%[C]) \n\t" \ + +#define store_4x16_1(N) \ + store_3x16_1(N) \ + "add "#N", %[C] \n\t" \ + "vmaskmovps 0x20(%[C]), %%ymm10, %%ymm9 \n\t" \ + "vaddps (%[C]), %%ymm6, %%ymm6 \n\t" \ + "vaddps %%ymm9, %%ymm7, %%ymm7 \n\t" \ + "vmovups %%ymm6, (%[C]) \n\t" \ + "vmaskmovps %%ymm7, %%ymm10, 0x20(%[C]) \n\t" \ + + +#define asm_1x8_kernel(i0, f0, rtype) \ + "vmovaps "#f0"(%[B]), "#rtype"4 \n\t" \ + "vbroadcastss "#i0"(%[A0]), "#rtype"5 \n\t" \ + "vfmadd231ps "#rtype"5, "#rtype"4, "#rtype"0 \n\t" + +#define asm_2x8_kernel(i0, f0, rtype) \ + asm_1x8_kernel(i0, f0, rtype) \ + "vbroadcastss "#i0"(%[A1]), "#rtype"5 \n\t" \ + "vfmadd231ps "#rtype"5, "#rtype"4, "#rtype"1 \n\t" + +#define asm_3x8_kernel(i0, f0, rtype) \ + asm_2x8_kernel(i0, f0, rtype) \ + "vbroadcastss "#i0"(%[A2]), "#rtype"5 \n\t" \ + "vfmadd231ps "#rtype"5, "#rtype"4, "#rtype"2 \n\t" + +#define asm_4x8_kernel(i0, f0, rtype) \ + asm_3x8_kernel(i0, f0, rtype) \ + "vbroadcastss "#i0"(%[A3]), "#rtype"5 \n\t" \ + "vfmadd231ps "#rtype"5, "#rtype"4, "#rtype"3 \n\t" + +#define store_1x8_0(N, rtype) \ + "vaddps (%[C]), "#rtype"0, "#rtype"0 \n\t" \ + "vmovups "#rtype"0, (%[C]) \n\t" + +#define store_2x8_0(N, rtype) \ + store_1x8_0(N, rtype) \ + "add "#N", %[C] \n\t" \ + "vaddps (%[C]), "#rtype"1, "#rtype"1 \n\t" \ + "vmovups "#rtype"1, (%[C]) \n\t" + +#define store_3x8_0(N, rtype) \ + store_2x8_0(N, rtype) \ + "add "#N", %[C] \n\t" \ + "vaddps (%[C]), "#rtype"2, "#rtype"2 \n\t" \ + "vmovups "#rtype"2, (%[C]) \n\t" + +#define store_4x8_0(N, rtype) \ + store_3x8_0(N, rtype) \ + "add "#N", %[C] \n\t" \ + "vaddps (%[C]), "#rtype"3, "#rtype"3 \n\t" \ + "vmovups "#rtype"3, (%[C]) \n\t" + +#define store_1x8_1(N, rtype) \ + "vmovups (%[mask]), "#rtype"5 \n\t" \ + "vmaskmovps (%[C]), "#rtype"5, "#rtype"4 \n\t" \ + "vaddps "#rtype"4, "#rtype"0, "#rtype"0 \n\t" \ + "vmaskmovps "#rtype"0, "#rtype"5, (%[C]) \n\t" + +#define store_2x8_1(N, rtype) \ + store_1x8_1(N, rtype) \ + "add "#N", %[C] \n\t" \ + "vmaskmovps (%[C]), "#rtype"5, "#rtype"4 \n\t" \ + "vaddps "#rtype"4, "#rtype"1, "#rtype"1 \n\t" \ + "vmaskmovps "#rtype"1, "#rtype"5, (%[C]) \n\t" + +#define store_3x8_1(N, rtype) \ + store_2x8_1(N, rtype) \ + "add "#N", %[C] \n\t" \ + "vmaskmovps (%[C]), "#rtype"5, "#rtype"4 \n\t" \ + "vaddps "#rtype"4, "#rtype"2, "#rtype"2 \n\t" \ + "vmaskmovps "#rtype"2, "#rtype"5, (%[C]) 
\n\t" + +#define store_4x8_1(N, rtype) \ + store_3x8_1(N, rtype) \ + "add "#N", %[C] \n\t" \ + "vmaskmovps (%[C]), "#rtype"5, "#rtype"4 \n\t" \ + "vaddps "#rtype"4, "#rtype"3, "#rtype"3 \n\t" \ + "vmaskmovps "#rtype"3, "#rtype"5, (%[C]) \n\t" + +#define kernel_24_4_loop(m) \ + "prefetcht0 0x140(%[B]) \n\t" \ + "prefetcht0 0x180(%[B]) \n\t" \ + asm_##m##x24_kernel(0x0, 0x0, 0x20, 0x40) \ + "prefetcht0 0x1C0(%[B]) \n\t" \ + asm_##m##x24_kernel(0x4, 0x60, 0x80, 0xA0) \ + "prefetcht0 0x200(%[B]) \n\t" \ + "prefetcht0 0x240(%[B]) \n\t" \ + asm_##m##x24_kernel(0x8, 0xC0, 0xE0, 0x100) \ + "prefetcht0 0x280(%[B]) \n\t" \ + asm_##m##x24_kernel(0xC, 0x120, 0x140, 0x160) \ + "add $0x180, %[B] \n\t" + +#define kernel_16_4_loop(m) \ + "prefetcht0 0x140(%1) \n\t" \ + asm_##m##x16_kernel(0x0, 0x0, 0x20) \ + "prefetcht0 0x180(%1) \n\t" \ + asm_##m##x16_kernel(0x4, 0x40, 0x60) \ + "prefetcht0 0x1C0(%1) \n\t" \ + asm_##m##x16_kernel(0x8, 0x80, 0xA0) \ + "prefetcht0 0x200(%1) \n\t" \ + asm_##m##x16_kernel(0xC, 0xC0, 0xE0) \ + "add $0x100, %[B] \n\t" + +#define kernel_8_4_loop(m) \ + asm_##m##x8_kernel(0x0, 0x0, %%ymm) \ + asm_##m##x8_kernel(0x4, 0x20, %%ymm) \ + asm_##m##x8_kernel(0x8, 0x40, %%ymm) \ + asm_##m##x8_kernel(0xC, 0x60, %%ymm) \ + "add $0x80, %[B] \n\t" + +#define kernel_4_4_loop(m) \ + asm_##m##x8_kernel(0x0, 0x0, %%xmm) \ + asm_##m##x8_kernel(0x4, 0x10, %%xmm) \ + asm_##m##x8_kernel(0x8, 0x20, %%xmm) \ + asm_##m##x8_kernel(0xC, 0x30, %%xmm) \ + "add $0x40, %[B] \n\t" + +#define m_24_kernel(m, x, edge) \ + __asm__ __volatile__(clear##x##Regs(%%ymm) \ + "mov %[bk], %%ecx \n\t" \ + "shr $2, %%ecx \n\t" \ + "je 1f \n\t" \ + ".align 16 \n\t" \ + "0: \n\t" \ + kernel_24_4_loop(m) \ + "add $0x10, %[A0] \n\t" \ + "add $0x10, %[A1] \n\t" \ + "add $0x10, %[A2] \n\t" \ + "add $0x10, %[A3] \n\t" \ + "dec %%ecx \n\t" \ + "jg 0b \n\t" \ + ".align 16 \n\t" \ + "1: \n\t" \ + "mov %[bk], %%ecx \n\t" \ + "and $3, %%ecx \n\t" \ + "je 3f \n\t" \ + ".align 16 \n\t" \ + "2: \n\t" \ + asm_##m##x24_kernel(0x0, 0x0, 0x20, 0x40) \ + "add $0x60, %[B] \n\t" \ + "add $0x4, %[A0] \n\t" \ + "add $0x4, %[A1] \n\t" \ + "add $0x4, %[A2] \n\t" \ + "add $0x4, %[A3] \n\t" \ + "dec %%ecx \n\t" \ + "jg 2b \n\t" \ + "3: \n\t" \ + "shl $2, %%rax \n\t" \ + store_##m##x24_##edge(%%rax) \ + : [B] "+r" (matrixB), \ + [A0] "+r" (matrixA), \ + [A1] "+r" (A1), \ + [A2] "+r" (A2), \ + [A3] "+r" (A3), \ + [C] "+r" (matrixC) \ + : "a"((I64)N), \ + [bk] "r" (bk), \ + [mask] "r" (mask) \ + : "%ecx", \ + "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", \ + "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", \ + "%ymm10", "%ymm11", "%ymm12", "%ymm13", \ + "%ymm14", "%ymm15", "memory"); + +#define m_16_kernel(m, x, edge) \ + __asm__ __volatile__(clear##x##Regs(%%ymm) \ + "mov %[bk], %%ecx \n\t" \ + "shr $2, %%ecx \n\t" \ + "je 1f \n\t" \ + ".align 16 \n\t" \ + "0: \n\t" \ + kernel_16_4_loop(m) \ + "add $0x10, %[A0] \n\t" \ + "add $0x10, %[A1] \n\t" \ + "add $0x10, %[A2] \n\t" \ + "add $0x10, %[A3] \n\t" \ + "dec %%ecx \n\t" \ + "jg 0b \n\t" \ + ".align 16 \n\t" \ + "1: \n\t" \ + "mov %[bk], %%ecx \n\t" \ + "and $3, %%ecx \n\t" \ + "je 3f \n\t" \ + ".align 16 \n\t" \ + "2: \n\t" \ + asm_##m##x16_kernel(0x0, 0x0, 0x20) \ + "add $0x40, %[B] \n\t" \ + "add $0x4, %[A0] \n\t" \ + "add $0x4, %[A1] \n\t" \ + "add $0x4, %[A2] \n\t" \ + "add $0x4, %[A3] \n\t" \ + "dec %%ecx \n\t" \ + "jg 2b \n\t" \ + "3: \n\t" \ + "shl $2, %%rax \n\t" \ + store_##m##x16_##edge(%%rax) \ + : [B] "+r" (matrixB), \ + [A0] "+r" (matrixA), \ + [A1] "+r" (A1), \ + [A2] "+r" (A2), \ + [A3] "+r" 
(A3), \ + [C] "+r" (matrixC) \ + : "a"((I64)N), \ + [bk] "r" (bk), \ + [mask] "r" (mask) \ + : "%ecx", \ + "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", \ + "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", \ + "%ymm10", "memory"); + +#define asm_4_kernel(m) \ + asm_##m##x8_kernel(0x0, 0x0, %%xmm) \ + "add $0x10, %[B] \n\t" + +#define asm_8_kernel(m) \ + asm_##m##x8_kernel(0x0, 0x0, %%ymm) \ + "add $0x20, %[B] \n\t" + +#define m_8_kernel_wrap(m, n, x, rtype, edge) \ + __asm__ __volatile__(clear##x##Regs(rtype) \ + "mov %[bk], %%ecx \n\t" \ + "shr $2, %%ecx \n\t" \ + "je 1f \n\t" \ + ".align 16 \n\t" \ + "0: \n\t" \ + kernel_##n##_4_loop(m) \ + "add $0x10, %[A0] \n\t" \ + "add $0x10, %[A1] \n\t" \ + "add $0x10, %[A2] \n\t" \ + "add $0x10, %[A3] \n\t" \ + "dec %%ecx \n\t" \ + "jg 0b \n\t" \ + ".align 16 \n\t" \ + "1: \n\t" \ + "mov %[bk], %%ecx \n\t" \ + "and $3, %%ecx \n\t" \ + "je 3f \n\t" \ + ".align 16 \n\t" \ + "2: \n\t" \ + asm_##n##_kernel(m) \ + "add $0x4, %[A0] \n\t" \ + "add $0x4, %[A1] \n\t" \ + "add $0x4, %[A2] \n\t" \ + "add $0x4, %[A3] \n\t" \ + "dec %%ecx \n\t" \ + "jg 2b \n\t" \ + "3: \n\t" \ + "shl $2, %%rax \n\t" \ + store_##m##x8_##edge(%%rax, rtype) \ + : [B] "+r" (matrixB), \ + [A0] "+r" (matrixA), \ + [A1] "+r" (A1), \ + [A2] "+r" (A2), \ + [A3] "+r" (A3), \ + [C] "+r" (matrixC) \ + : "a"((I64)N), \ + [bk] "r" (bk), \ + [mask] "r" (mask) \ + : "%ecx", \ + "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", \ + "%ymm5", "memory"); + +#define m_8_kernel(m, x, edge) \ + m_8_kernel_wrap(m, 8, x, %%ymm, edge) + +#define m_4_kernel(m, x, edge) \ + m_8_kernel_wrap(m, 4, x, %%xmm, edge) + +#define mmm_mxn_asm(m, n, regNum) \ + void mmm_avx2_##m##x##n##_asm( \ + U32 um, U32 un, U32 bk, F32 *matrixA, F32 *matrixB, \ + F32 *matrixC, U32 N, I32 *mask, F32 *A1, F32 *A2, F32 *A3) \ +{ \ + if (mask == nullptr) { \ + m_##n##_kernel(m, regNum, 0) \ + } else { \ + m_##n##_kernel(m, regNum, 1) \ + } \ +} + +mmm_mxn_asm(4, 24, 12) +mmm_mxn_asm(3, 24, 9) +mmm_mxn_asm(2, 24, 6) +mmm_mxn_asm(1, 24, 3) +mmm_mxn_asm(4, 16, 8) +mmm_mxn_asm(3, 16, 6) +mmm_mxn_asm(2, 16, 4) +mmm_mxn_asm(1, 16, 2) +mmm_mxn_asm(4, 8, 4) +mmm_mxn_asm(3, 8, 3) +mmm_mxn_asm(2, 8, 2) +mmm_mxn_asm(1, 8, 1) +mmm_mxn_asm(4, 4, 4) +mmm_mxn_asm(3, 4, 3) +mmm_mxn_asm(2, 4, 2) +mmm_mxn_asm(1, 4, 1) + +// clang-format on + +void mmm_avx2_n_mtail(U32 um, + U32 un, + U32 bk, + F32 *matrixA, + F32 *matrixB, + F32 *matrixC, + U32 N, + I32 *mask, + F32 *A1, + F32 *A2, + F32 *A3) +{ + F32 *ar[4] = {matrixA, A1, A2, A3}; + for (U32 i = 0; i < um; ++i) { + for (U32 j = 0; j < un; ++j) { + for (U32 k = 0; k < bk; ++k) { + matrixC[i * N + j] += ar[i][k] * matrixB[k * un + j]; + } + } + } +} void matrix_matrix_multiply_tmp_bytes_fp32( U32 row1, U32 col1, U32 row2, U32 col2, DataType dt, U32 *bytes) { - *bytes = row1 * col1 + row2 * col2; + *bytes = row1 * col1 + (col2 + 7) / 8 * 8 * row2; *bytes *= sizeof(dt); *bytes += 32; } @@ -39,15 +598,18 @@ EE matrix_matrix_multiply_transform_rhsN_fp32(TensorDesc desc, F32 *src, F32 *ds U32 N, K, blockSizeK, unrollSizeN; CHECK_STATUS(tensor2dGet(desc, &dt, &df, &K, &N)); F32 unrollSize[4] = {4, 8, 16, 24}; + U32 resN = N % UNROLL_N; + U32 edgeBlockNSizeIdx = (resN > 4) ? 
((resN + 7) / 8) : 0; + U32 edgeBlockNSize = unrollSize[edgeBlockNSizeIdx]; // buffer addr algined to 32 F32 *packB = (F32 *)align_addr(dst, 32); for (U32 bk = 0; bk < K; bk += blockSizeK) { blockSizeK = UNI_MIN(BOLCK_K_DIM, K - bk); for (U32 un = 0; un < N; un += unrollSizeN) { - unrollSizeN = UNI_MIN(UNROLL_N, N - un); - unrollSizeN = UNI_MIN(unrollSize[unrollSizeN / 8], unrollSizeN); - matrix2_trans(unrollSizeN, blockSizeK, N, src + un, packB); + unrollSizeN = UNI_MAX(UNI_MIN(UNROLL_N, N - un), edgeBlockNSize); + matrix2_trans_w( + unrollSizeN, UNI_MIN(N - un, unrollSizeN), blockSizeK, N, src + un, packB); packB += unrollSizeN * blockSizeK; } src += blockSizeK * N; @@ -62,15 +624,18 @@ EE matrix_matrix_multiply_transform_rhsT_fp32(TensorDesc desc, F32 *src, F32 *ds U32 N, K, blockSizeK, unrollSizeN; CHECK_STATUS(tensor2dGet(desc, &dt, &df, &N, &K)); F32 unrollSize[4] = {4, 8, 16, 24}; + U32 resN = N % UNROLL_N; + U32 edgeBlockNSizeIdx = (resN > 4) ? ((resN + 7) / 8) : 0; + U32 edgeBlockNSize = unrollSize[edgeBlockNSizeIdx]; // buffer addr aligned to 32 F32 *packB = (F32 *)align_addr(dst, 32); for (U32 bk = 0; bk < K; bk += blockSizeK) { blockSizeK = UNI_MIN(BOLCK_K_DIM, K - bk); for (U32 un = 0; un < N; un += unrollSizeN) { - unrollSizeN = UNI_MIN(UNROLL_N, N - un); - unrollSizeN = UNI_MIN(unrollSize[unrollSizeN >> 3], unrollSizeN); - matrix1_trans(unrollSizeN, blockSizeK, K, src + un * K, packB); + unrollSizeN = UNI_MAX(UNI_MIN(UNROLL_N, N - un), edgeBlockNSize); + matrix1_trans_w( + unrollSizeN, UNI_MIN(N - un, unrollSizeN), blockSizeK, K, src + un * K, packB); packB += unrollSizeN * blockSizeK; } src += blockSizeK; @@ -78,1367 +643,79 @@ EE matrix_matrix_multiply_transform_rhsT_fp32(TensorDesc desc, F32 *src, F32 *ds return SUCCESS; } -void mmm_avx2_4x24_asm(U32 um, U32 un, U32 bk, F32 *matrixA, F32 *matrixB, F32 *matrixC, U32 N) -{ - __asm__ __volatile__("vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" - "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" - "vxorps %%ymm2, %%ymm2, %%ymm2 \n\t" - "vxorps %%ymm3, %%ymm3, %%ymm3 \n\t" - "vxorps %%ymm4, %%ymm4, %%ymm4 \n\t" - "vxorps %%ymm5, %%ymm5, %%ymm5 \n\t" - "vxorps %%ymm6, %%ymm6, %%ymm6 \n\t" - "vxorps %%ymm7, %%ymm7, %%ymm7 \n\t" - "vxorps %%ymm8, %%ymm8, %%ymm8 \n\t" - "vxorps %%ymm9, %%ymm9, %%ymm9 \n\t" - "vxorps %%ymm10, %%ymm10, %%ymm10 \n\t" - "vxorps %%ymm11, %%ymm11, %%ymm11 \n\t" - - "mov %0, %%ecx \n\t" - "shr $2, %%ecx \n\t" - "je .k_loop_4x24_end \n\t" - ".align 16 \n\t" - ".k_loop_4x24: \n\t" - - "prefetcht0 0x140(%1) \n\t" - "prefetcht0 0x180(%1) \n\t" - "prefetcht0 0x140(%2) \n\t" - - "vmovaps (%1), %%ymm12 \n\t" - "vmovaps 0x20(%1), %%ymm13 \n\t" - "vmovaps 0x40(%1), %%ymm14 \n\t" - "vbroadcastss 0x0(%2), %%ymm15 \n\t" - "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" - "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" - "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" - "vbroadcastss 0x4(%2), %%ymm15 \n\t" - "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" - "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" - "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" - "vbroadcastss 0x8(%2), %%ymm15 \n\t" - "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" - "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" - "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" - "vbroadcastss 0xC(%2), %%ymm15 \n\t" - "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" - "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" - "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" - - "prefetcht0 0x1C0(%1) \n\t" - - "vmovaps 0x60(%1), %%ymm12 \n\t" - "vmovaps 0x80(%1), %%ymm13 \n\t" - "vmovaps 0xA0(%1), %%ymm14 \n\t" - "vbroadcastss 0x10(%2), 
%%ymm15 \n\t" - "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" - "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" - "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" - "vbroadcastss 0x14(%2), %%ymm15 \n\t" - "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" - "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" - "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" - "vbroadcastss 0x18(%2), %%ymm15 \n\t" - "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" - "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" - "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" - "vbroadcastss 0x1C(%2), %%ymm15 \n\t" - "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" - "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" - "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" - - "prefetcht0 0x200(%1) \n\t" - "prefetcht0 0x240(%1) \n\t" - - "vmovaps 0xC0(%1), %%ymm12 \n\t" - "vmovaps 0xE0(%1), %%ymm13 \n\t" - "vmovaps 0x100(%1), %%ymm14 \n\t" - "vbroadcastss 0x20(%2), %%ymm15 \n\t" - "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" - "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" - "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" - "vbroadcastss 0x24(%2), %%ymm15 \n\t" - "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" - "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" - "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" - "vbroadcastss 0x28(%2), %%ymm15 \n\t" - "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" - "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" - "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" - "vbroadcastss 0x2C(%2), %%ymm15 \n\t" - "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" - "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" - "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" - - "prefetcht0 0x280(%1) \n\t" - - "vmovaps 0x120(%1), %%ymm12 \n\t" - "vmovaps 0x140(%1), %%ymm13 \n\t" - "vmovaps 0x160(%1), %%ymm14 \n\t" - "vbroadcastss 0x30(%2), %%ymm15 \n\t" - "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" - "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" - "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" - "vbroadcastss 0x34(%2), %%ymm15 \n\t" - "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" - "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" - "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" - "vbroadcastss 0x38(%2), %%ymm15 \n\t" - "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" - "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" - "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" - "vbroadcastss 0x3C(%2), %%ymm15 \n\t" - "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" - "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" - "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" - - "add $0x180, %1 \n\t" - "add $0x40, %2 \n\t" - - "sub $1, %%ecx \n\t" - "jg .k_loop_4x24 \n\t" - ".align 16 \n\t" - ".k_loop_4x24_end: \n\t" - - "mov %0, %%ecx \n\t" - "and $3, %%ecx \n\t" - "je .k_loop_4x24_remain_end \n\t" - ".k_loop_4x24_remain: \n\t" - "vmovaps (%1), %%ymm12 \n\t" - "vmovaps 0x20(%1), %%ymm13 \n\t" - "vmovaps 0x40(%1), %%ymm14 \n\t" - "vbroadcastss 0x0(%2), %%ymm15 \n\t" - "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" - "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" - "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" - "vbroadcastss 0x4(%2), %%ymm15 \n\t" - "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" - "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" - "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" - "vbroadcastss 0x8(%2), %%ymm15 \n\t" - "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" - "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" - "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" - "vbroadcastss 0xC(%2), %%ymm15 \n\t" - "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" - "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" - "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" - "add $0x60, %1 \n\t" - "add $0x10, %2 \n\t" - "sub $1, 
%%ecx \n\t" - "jg .k_loop_4x24_remain \n\t" - - ".k_loop_4x24_remain_end: \n\t" - "mov %4, %%eax \n\t" - "shl $2, %%eax \n\t" - "mov %%eax, %%eax \n\t" - "prefetcht0 0x40(%3) \n\t" - "prefetcht0 (%3, %%rax) \n\t" - "prefetcht0 0x40(%3, %%rax) \n\t" - "vaddps (%3), %%ymm0, %%ymm0 \n\t" - "vaddps 0x20(%3), %%ymm1, %%ymm1 \n\t" - "vaddps 0x40(%3), %%ymm2, %%ymm2 \n\t" - "vmovups %%ymm0, (%3) \n\t" - "vmovups %%ymm1, 0x20(%3) \n\t" - "vmovups %%ymm2, 0x40(%3) \n\t" - "add %%rax, %3 \n\t" - "prefetcht0 (%3, %%rax) \n\t" - "prefetcht0 0x40(%3, %%rax) \n\t" - "vaddps (%3), %%ymm3, %%ymm3 \n\t" - "vaddps 0x20(%3), %%ymm4, %%ymm4 \n\t" - "vaddps 0x40(%3), %%ymm5, %%ymm5 \n\t" - "vmovups %%ymm3, (%3) \n\t" - "vmovups %%ymm4, 0x20(%3) \n\t" - "vmovups %%ymm5, 0x40(%3) \n\t" - "add %%rax, %3 \n\t" - "prefetcht0 (%3, %%rax) \n\t" - "prefetcht0 0x40(%3, %%rax) \n\t" - "vaddps (%3), %%ymm6, %%ymm6 \n\t" - "vaddps 0x20(%3), %%ymm7, %%ymm7 \n\t" - "vaddps 0x40(%3), %%ymm8, %%ymm8 \n\t" - "vmovups %%ymm6, (%3) \n\t" - "vmovups %%ymm7, 0x20(%3) \n\t" - "vmovups %%ymm8, 0x40(%3) \n\t" - "add %%rax, %3 \n\t" - "prefetcht0 0x40(%3) \n\t" - "vaddps (%3), %%ymm9, %%ymm9 \n\t" - "vaddps 0x20(%3), %%ymm10, %%ymm10 \n\t" - "vaddps 0x40(%3), %%ymm11, %%ymm11 \n\t" - "vmovups %%ymm9, (%3) \n\t" - "vmovups %%ymm10, 0x20(%3) \n\t" - "vmovups %%ymm11, 0x40(%3) \n\t" - : - : "r"(bk), "r"(matrixB), "r"(matrixA), "r"(matrixC), "r"(N) - : "%eax", "%rax", "%ecx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", - "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", - "%ymm13", "%ymm14", "%ymm15", "memory"); -} - -void mmm_avx2_4x16_asm(U32 um, U32 un, U32 bk, F32 *matrixA, F32 *matrixB, F32 *matrixC, U32 N) -{ - __asm__ __volatile__("vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" - "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" - "vxorps %%ymm2, %%ymm2, %%ymm2 \n\t" - "vxorps %%ymm3, %%ymm3, %%ymm3 \n\t" - "vxorps %%ymm4, %%ymm4, %%ymm4 \n\t" - "vxorps %%ymm5, %%ymm5, %%ymm5 \n\t" - "vxorps %%ymm6, %%ymm6, %%ymm6 \n\t" - "vxorps %%ymm7, %%ymm7, %%ymm7 \n\t" - - "mov %0, %%ecx \n\t" - "shr $2, %%ecx \n\t" - "je .k_loop_4x16_end \n\t" - ".align 16 \n\t" - ".k_loop_4x16: \n\t" - - "prefetcht0 0x140(%1) \n\t" - "prefetcht0 0x140(%2) \n\t" - - "vmovaps (%1), %%ymm8 \n\t" - "vmovaps 0x20(%1), %%ymm9 \n\t" - "vbroadcastss 0x0(%2), %%ymm10 \n\t" - "vfmadd231ps %%ymm10, %%ymm8, %%ymm0 \n\t" - "vfmadd231ps %%ymm10, %%ymm9, %%ymm1 \n\t" - "vbroadcastss 0x4(%2), %%ymm10 \n\t" - "vfmadd231ps %%ymm10, %%ymm8, %%ymm2 \n\t" - "vfmadd231ps %%ymm10, %%ymm9, %%ymm3 \n\t" - "vbroadcastss 0x8(%2), %%ymm10 \n\t" - "vfmadd231ps %%ymm10, %%ymm8, %%ymm4 \n\t" - "vfmadd231ps %%ymm10, %%ymm9, %%ymm5 \n\t" - "vbroadcastss 0xC(%2), %%ymm10 \n\t" - "vfmadd231ps %%ymm10, %%ymm8, %%ymm6 \n\t" - "vfmadd231ps %%ymm10, %%ymm9, %%ymm7 \n\t" - - "prefetcht0 0x180(%1) \n\t" - - "vmovaps 0x40(%1), %%ymm8 \n\t" - "vmovaps 0x60(%1), %%ymm9 \n\t" - "vbroadcastss 0x10(%2), %%ymm10 \n\t" - "vfmadd231ps %%ymm10, %%ymm8, %%ymm0 \n\t" - "vfmadd231ps %%ymm10, %%ymm9, %%ymm1 \n\t" - "vbroadcastss 0x14(%2), %%ymm10 \n\t" - "vfmadd231ps %%ymm10, %%ymm8, %%ymm2 \n\t" - "vfmadd231ps %%ymm10, %%ymm9, %%ymm3 \n\t" - "vbroadcastss 0x18(%2), %%ymm10 \n\t" - "vfmadd231ps %%ymm10, %%ymm8, %%ymm4 \n\t" - "vfmadd231ps %%ymm10, %%ymm9, %%ymm5 \n\t" - "vbroadcastss 0x1C(%2), %%ymm10 \n\t" - "vfmadd231ps %%ymm10, %%ymm8, %%ymm6 \n\t" - "vfmadd231ps %%ymm10, %%ymm9, %%ymm7 \n\t" - - "prefetcht0 0x1C0(%1) \n\t" - - "vmovaps 0x80(%1), %%ymm8 \n\t" - "vmovaps 0xA0(%1), %%ymm9 \n\t" - 
"vbroadcastss 0x20(%2), %%ymm10 \n\t" - "vfmadd231ps %%ymm10, %%ymm8, %%ymm0 \n\t" - "vfmadd231ps %%ymm10, %%ymm9, %%ymm1 \n\t" - "vbroadcastss 0x24(%2), %%ymm10 \n\t" - "vfmadd231ps %%ymm10, %%ymm8, %%ymm2 \n\t" - "vfmadd231ps %%ymm10, %%ymm9, %%ymm3 \n\t" - "vbroadcastss 0x28(%2), %%ymm10 \n\t" - "vfmadd231ps %%ymm10, %%ymm8, %%ymm4 \n\t" - "vfmadd231ps %%ymm10, %%ymm9, %%ymm5 \n\t" - "vbroadcastss 0x2C(%2), %%ymm10 \n\t" - "vfmadd231ps %%ymm10, %%ymm8, %%ymm6 \n\t" - "vfmadd231ps %%ymm10, %%ymm9, %%ymm7 \n\t" - - "prefetcht0 0x200(%1) \n\t" - - "vmovaps 0xC0(%1), %%ymm8 \n\t" - "vmovaps 0xE0(%1), %%ymm9 \n\t" - "vbroadcastss 0x30(%2), %%ymm10 \n\t" - "vfmadd231ps %%ymm10, %%ymm8, %%ymm0 \n\t" - "vfmadd231ps %%ymm10, %%ymm9, %%ymm1 \n\t" - "vbroadcastss 0x34(%2), %%ymm10 \n\t" - "vfmadd231ps %%ymm10, %%ymm8, %%ymm2 \n\t" - "vfmadd231ps %%ymm10, %%ymm9, %%ymm3 \n\t" - "vbroadcastss 0x38(%2), %%ymm10 \n\t" - "vfmadd231ps %%ymm10, %%ymm8, %%ymm4 \n\t" - "vfmadd231ps %%ymm10, %%ymm9, %%ymm5 \n\t" - "vbroadcastss 0x3C(%2), %%ymm10 \n\t" - "vfmadd231ps %%ymm10, %%ymm8, %%ymm6 \n\t" - "vfmadd231ps %%ymm10, %%ymm9, %%ymm7 \n\t" - - "add $0x100, %1 \n\t" - "add $0x40, %2 \n\t" - - "sub $1, %%ecx \n\t" - "jg .k_loop_4x16 \n\t" - ".align 16 \n\t" - ".k_loop_4x16_end: \n\t" - - "mov %0, %%ecx \n\t" - "and $3, %%ecx \n\t" - "je .k_loop_4x16_remain_end \n\t" - ".k_loop_4x16_remain: \n\t" - "vmovaps (%1), %%ymm8 \n\t" - "vmovaps 0x20(%1), %%ymm9 \n\t" - "vbroadcastss 0x0(%2), %%ymm10 \n\t" - "vfmadd231ps %%ymm10, %%ymm8, %%ymm0 \n\t" - "vfmadd231ps %%ymm10, %%ymm9, %%ymm1 \n\t" - "vbroadcastss 0x4(%2), %%ymm10 \n\t" - "vfmadd231ps %%ymm10, %%ymm8, %%ymm2 \n\t" - "vfmadd231ps %%ymm10, %%ymm9, %%ymm3 \n\t" - "vbroadcastss 0x8(%2), %%ymm10 \n\t" - "vfmadd231ps %%ymm10, %%ymm8, %%ymm4 \n\t" - "vfmadd231ps %%ymm10, %%ymm9, %%ymm5 \n\t" - "vbroadcastss 0xC(%2), %%ymm10 \n\t" - "vfmadd231ps %%ymm10, %%ymm8, %%ymm6 \n\t" - "vfmadd231ps %%ymm10, %%ymm9, %%ymm7 \n\t" - "add $0x40, %1 \n\t" - "add $0x10, %2 \n\t" - "sub $1, %%ecx \n\t" - "jg .k_loop_4x16_remain \n\t" - - ".k_loop_4x16_remain_end: \n\t" - "mov %4, %%eax \n\t" - "shl $2, %%eax \n\t" - "mov %%eax, %%eax \n\t" - "prefetcht0 (%3, %%rax) \n\t" - "vaddps (%3), %%ymm0, %%ymm0 \n\t" - "vaddps 0x20(%3), %%ymm1, %%ymm1 \n\t" - "vmovups %%ymm0, (%3) \n\t" - "vmovups %%ymm1, 0x20(%3) \n\t" - "add %%rax, %3 \n\t" - "prefetcht0 (%3, %%rax) \n\t" - "vaddps (%3), %%ymm2, %%ymm2 \n\t" - "vaddps 0x20(%3), %%ymm3, %%ymm3 \n\t" - "vmovups %%ymm2, (%3) \n\t" - "vmovups %%ymm3, 0x20(%3) \n\t" - "add %%rax, %3 \n\t" - "prefetcht0 (%3, %%rax) \n\t" - "vaddps (%3), %%ymm4, %%ymm4 \n\t" - "vaddps 0x20(%3), %%ymm5, %%ymm5 \n\t" - "vmovups %%ymm4, (%3) \n\t" - "vmovups %%ymm5, 0x20(%3) \n\t" - "add %%rax, %3 \n\t" - "vaddps (%3), %%ymm6, %%ymm6 \n\t" - "vaddps 0x20(%3), %%ymm7, %%ymm7 \n\t" - "vmovups %%ymm6, (%3) \n\t" - "vmovups %%ymm7, 0x20(%3) \n\t" - : - : "r"(bk), "r"(matrixB), "r"(matrixA), "r"(matrixC), "r"(N) - : "%eax", "%rax", "%ecx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", - "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "memory"); -} - -void mmm_avx2_4x8_asm(U32 um, U32 un, U32 bk, F32 *matrixA, F32 *matrixB, F32 *matrixC, U32 N) -{ - __asm__ __volatile__( - "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" - "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" - "vxorps %%ymm2, %%ymm2, %%ymm2 \n\t" - "vxorps %%ymm3, %%ymm3, %%ymm3 \n\t" - - "mov %0, %%ecx \n\t" - "shr $2, %%ecx \n\t" - "je .k_loop_4x8_end \n\t" - ".align 16 \n\t" - ".k_loop_4x8: \n\t" - - "prefetcht0 0x140(%1) 
\n\t" - "prefetcht0 0x140(%2) \n\t" - - "vmovaps (%1), %%ymm4 \n\t" - "vbroadcastss 0x0(%2), %%ymm5 \n\t" - "vfmadd231ps %%ymm5, %%ymm4, %%ymm0 \n\t" - "vbroadcastss 0x4(%2), %%ymm5 \n\t" - "vfmadd231ps %%ymm5, %%ymm4, %%ymm1 \n\t" - "vbroadcastss 0x8(%2), %%ymm5 \n\t" - "vfmadd231ps %%ymm5, %%ymm4, %%ymm2 \n\t" - "vbroadcastss 0xC(%2), %%ymm5 \n\t" - "vfmadd231ps %%ymm5, %%ymm4, %%ymm3 \n\t" - - "vmovaps 0x20(%1), %%ymm4 \n\t" - "vbroadcastss 0x10(%2), %%ymm5 \n\t" - "vfmadd231ps %%ymm5, %%ymm4, %%ymm0 \n\t" - "vbroadcastss 0x14(%2), %%ymm5 \n\t" - "vfmadd231ps %%ymm5, %%ymm4, %%ymm1 \n\t" - "vbroadcastss 0x18(%2), %%ymm5 \n\t" - "vfmadd231ps %%ymm5, %%ymm4, %%ymm2 \n\t" - "vbroadcastss 0x1C(%2), %%ymm5 \n\t" - "vfmadd231ps %%ymm5, %%ymm4, %%ymm3 \n\t" - - "prefetcht0 0x180(%1) \n\t" - - "vmovaps 0x40(%1), %%ymm4 \n\t" - "vbroadcastss 0x20(%2), %%ymm5 \n\t" - "vfmadd231ps %%ymm5, %%ymm4, %%ymm0 \n\t" - "vbroadcastss 0x24(%2), %%ymm5 \n\t" - "vfmadd231ps %%ymm5, %%ymm4, %%ymm1 \n\t" - "vbroadcastss 0x28(%2), %%ymm5 \n\t" - "vfmadd231ps %%ymm5, %%ymm4, %%ymm2 \n\t" - "vbroadcastss 0x2C(%2), %%ymm5 \n\t" - "vfmadd231ps %%ymm5, %%ymm4, %%ymm3 \n\t" - - "vmovaps 0x60(%1), %%ymm4 \n\t" - "vbroadcastss 0x30(%2), %%ymm5 \n\t" - "vfmadd231ps %%ymm5, %%ymm4, %%ymm0 \n\t" - "vbroadcastss 0x34(%2), %%ymm5 \n\t" - "vfmadd231ps %%ymm5, %%ymm4, %%ymm1 \n\t" - "vbroadcastss 0x38(%2), %%ymm5 \n\t" - "vfmadd231ps %%ymm5, %%ymm4, %%ymm2 \n\t" - "vbroadcastss 0x3C(%2), %%ymm5 \n\t" - "vfmadd231ps %%ymm5, %%ymm4, %%ymm3 \n\t" - - "add $0x80, %1 \n\t" - "add $0x40, %2 \n\t" - - "sub $1, %%ecx \n\t" - "jg .k_loop_4x8 \n\t" - ".align 16 \n\t" - ".k_loop_4x8_end: \n\t" - - "mov %0, %%ecx \n\t" - "and $3, %%ecx \n\t" - "je .k_loop_4x8_remain_end \n\t" - ".k_loop_4x8_remain: \n\t" - "vmovaps (%1), %%ymm4 \n\t" - "vbroadcastss 0x0(%2), %%ymm5 \n\t" - "vfmadd231ps %%ymm5, %%ymm4, %%ymm0 \n\t" - "vbroadcastss 0x4(%2), %%ymm5 \n\t" - "vfmadd231ps %%ymm5, %%ymm4, %%ymm1 \n\t" - "vbroadcastss 0x8(%2), %%ymm5 \n\t" - "vfmadd231ps %%ymm5, %%ymm4, %%ymm2 \n\t" - "vbroadcastss 0xC(%2), %%ymm5 \n\t" - "vfmadd231ps %%ymm5, %%ymm4, %%ymm3 \n\t" - "add $0x20, %1 \n\t" - "add $0x10, %2 \n\t" - "sub $1, %%ecx \n\t" - "jg .k_loop_4x8_remain \n\t" - - ".k_loop_4x8_remain_end: \n\t" - "mov %4, %%eax \n\t" - "shl $2, %%eax \n\t" - "mov %%eax, %%eax \n\t" - "prefetcht0 (%3, %%rax) \n\t" - "vaddps (%3), %%ymm0, %%ymm0 \n\t" - "vmovups %%ymm0, (%3) \n\t" - "add %%rax, %3 \n\t" - "prefetcht0 (%3, %%rax) \n\t" - "vaddps (%3), %%ymm1, %%ymm1 \n\t" - "vmovups %%ymm1, (%3) \n\t" - "add %%rax, %3 \n\t" - "prefetcht0 (%3, %%rax) \n\t" - "vaddps (%3), %%ymm2, %%ymm2 \n\t" - "vmovups %%ymm2, (%3) \n\t" - "add %%rax, %3 \n\t" - "vaddps (%3), %%ymm3, %%ymm3 \n\t" - "vmovups %%ymm3, (%3) \n\t" - : - : "r"(bk), "r"(matrixB), "r"(matrixA), "r"(matrixC), "r"(N) - : "%eax", "%rax", "%ecx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "memory"); -} - -void mmm_avx2_4x4_asm(U32 um, U32 un, U32 bk, F32 *matrixA, F32 *matrixB, F32 *matrixC, U32 N) -{ - __asm__ __volatile__( - "vxorps %%xmm0, %%xmm0, %%xmm0 \n\t" - "vxorps %%xmm1, %%xmm1, %%xmm1 \n\t" - "vxorps %%xmm2, %%xmm2, %%xmm2 \n\t" - "vxorps %%xmm3, %%xmm3, %%xmm3 \n\t" - - "mov %0, %%ecx \n\t" - "shr $2, %%ecx \n\t" - "je .k_loop_4x4_end \n\t" - ".align 16 \n\t" - ".k_loop_4x4: \n\t" - - "prefetcht0 0x140(%1) \n\t" - "prefetcht0 0x140(%2) \n\t" - - "vmovaps (%1), %%xmm4 \n\t" - "vbroadcastss 0x0(%2), %%xmm5 \n\t" - "vfmadd231ps %%xmm5, %%xmm4, %%xmm0 \n\t" - "vbroadcastss 0x4(%2), %%xmm5 
\n\t" - "vfmadd231ps %%xmm5, %%xmm4, %%xmm1 \n\t" - "vbroadcastss 0x8(%2), %%xmm5 \n\t" - "vfmadd231ps %%xmm5, %%xmm4, %%xmm2 \n\t" - "vbroadcastss 0xC(%2), %%xmm5 \n\t" - "vfmadd231ps %%xmm5, %%xmm4, %%xmm3 \n\t" - - "vmovaps 0x10(%1), %%xmm4 \n\t" - "vbroadcastss 0x10(%2), %%xmm5 \n\t" - "vfmadd231ps %%xmm5, %%xmm4, %%xmm0 \n\t" - "vbroadcastss 0x14(%2), %%xmm5 \n\t" - "vfmadd231ps %%xmm5, %%xmm4, %%xmm1 \n\t" - "vbroadcastss 0x18(%2), %%xmm5 \n\t" - "vfmadd231ps %%xmm5, %%xmm4, %%xmm2 \n\t" - "vbroadcastss 0x1C(%2), %%xmm5 \n\t" - "vfmadd231ps %%xmm5, %%xmm4, %%xmm3 \n\t" - - "vmovaps 0x20(%1), %%xmm4 \n\t" - "vbroadcastss 0x20(%2), %%xmm5 \n\t" - "vfmadd231ps %%xmm5, %%xmm4, %%xmm0 \n\t" - "vbroadcastss 0x24(%2), %%xmm5 \n\t" - "vfmadd231ps %%xmm5, %%xmm4, %%xmm1 \n\t" - "vbroadcastss 0x28(%2), %%xmm5 \n\t" - "vfmadd231ps %%xmm5, %%xmm4, %%xmm2 \n\t" - "vbroadcastss 0x2C(%2), %%xmm5 \n\t" - "vfmadd231ps %%xmm5, %%xmm4, %%xmm3 \n\t" - - "vmovaps 0x30(%1), %%xmm4 \n\t" - "vbroadcastss 0x30(%2), %%xmm5 \n\t" - "vfmadd231ps %%xmm5, %%xmm4, %%xmm0 \n\t" - "vbroadcastss 0x34(%2), %%xmm5 \n\t" - "vfmadd231ps %%xmm5, %%xmm4, %%xmm1 \n\t" - "vbroadcastss 0x38(%2), %%xmm5 \n\t" - "vfmadd231ps %%xmm5, %%xmm4, %%xmm2 \n\t" - "vbroadcastss 0x3C(%2), %%xmm5 \n\t" - "vfmadd231ps %%xmm5, %%xmm4, %%xmm3 \n\t" - - "add $0x40, %1 \n\t" - "add $0x40, %2 \n\t" - - "sub $1, %%ecx \n\t" - "jg .k_loop_4x4 \n\t" - ".align 16 \n\t" - ".k_loop_4x4_end: \n\t" - - "mov %0, %%ecx \n\t" - "and $3, %%ecx \n\t" - "je .k_loop_4x4_remain_end \n\t" - - ".k_loop_4x4_remain: \n\t" - "vmovaps (%1), %%xmm4 \n\t" - "vbroadcastss 0x0(%2), %%xmm5 \n\t" - "vfmadd231ps %%xmm5, %%xmm4, %%xmm0 \n\t" - "vbroadcastss 0x4(%2), %%xmm5 \n\t" - "vfmadd231ps %%xmm5, %%xmm4, %%xmm1 \n\t" - "vbroadcastss 0x8(%2), %%xmm5 \n\t" - "vfmadd231ps %%xmm5, %%xmm4, %%xmm2 \n\t" - "vbroadcastss 0xC(%2), %%xmm5 \n\t" - "vfmadd231ps %%xmm5, %%xmm4, %%xmm3 \n\t" - "add $0x10, %1 \n\t" - "add $0x10, %2 \n\t" - "sub $1, %%ecx \n\t" - "jg .k_loop_4x4_remain \n\t" - - ".k_loop_4x4_remain_end: \n\t" - "mov %4, %%eax \n\t" - "shl $2, %%eax \n\t" - "mov %%eax, %%eax \n\t" - "prefetcht0 (%3, %%rax) \n\t" - "vaddps (%3), %%xmm0, %%xmm0 \n\t" - "vmovups %%xmm0, (%3) \n\t" - "add %%rax, %3 \n\t" - "prefetcht0 (%3, %%rax) \n\t" - "vaddps (%3), %%xmm1, %%xmm1 \n\t" - "vmovups %%xmm1, (%3) \n\t" - "add %%rax, %3 \n\t" - "prefetcht0 (%3, %%rax) \n\t" - "vaddps (%3), %%xmm2, %%xmm2 \n\t" - "vmovups %%xmm2, (%3) \n\t" - "add %%rax, %3 \n\t" - "vaddps (%3), %%xmm3, %%xmm3 \n\t" - "vmovups %%xmm3, (%3) \n\t" - : - : "r"(bk), "r"(matrixB), "r"(matrixA), "r"(matrixC), "r"(N) - : "%eax", "%rax", "%ecx", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "memory"); -} - -void mmm_avx2_2x24_asm(U32 um, U32 un, U32 bk, F32 *matrixA, F32 *matrixB, F32 *matrixC, U32 N) -{ - __asm__ __volatile__("mov %4, %%eax \n\t" - "shl $2, %%eax \n\t" - "mov %%eax, %%eax \n\t" - - "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" - "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" - "vxorps %%ymm2, %%ymm2, %%ymm2 \n\t" - "vxorps %%ymm3, %%ymm3, %%ymm3 \n\t" - "vxorps %%ymm4, %%ymm4, %%ymm4 \n\t" - "vxorps %%ymm5, %%ymm5, %%ymm5 \n\t" - - "mov %0, %%ecx \n\t" - "shr $2, %%ecx \n\t" - "je .k_loop_2x24_end \n\t" - - ".align 16 \n\t" - ".k_loop_2x24: \n\t" - - "prefetcht0 0x140(%1) \n\t" - "prefetcht0 0x180(%1) \n\t" - - "vmovaps (%1), %%ymm6 \n\t" - "vmovaps 0x20(%1), %%ymm7 \n\t" - "vmovaps 0x40(%1), %%ymm8 \n\t" - "vbroadcastss 0x0(%2), %%ymm9 \n\t" - "vfmadd231ps %%ymm9, %%ymm6, %%ymm0 \n\t" - "vfmadd231ps %%ymm9, 
%%ymm7, %%ymm1 \n\t" - "vfmadd231ps %%ymm9, %%ymm8, %%ymm2 \n\t" - "vbroadcastss 0x4(%2), %%ymm9 \n\t" - "vfmadd231ps %%ymm9, %%ymm6, %%ymm3 \n\t" - "vfmadd231ps %%ymm9, %%ymm7, %%ymm4 \n\t" - "vfmadd231ps %%ymm9, %%ymm8, %%ymm5 \n\t" - - "prefetcht0 0x1C0(%1) \n\t" - - "vmovaps 0x60(%1), %%ymm6 \n\t" - "vmovaps 0x80(%1), %%ymm7 \n\t" - "vmovaps 0xA0(%1), %%ymm8 \n\t" - "vbroadcastss 0x8(%2), %%ymm9 \n\t" - "vfmadd231ps %%ymm9, %%ymm6, %%ymm0 \n\t" - "vfmadd231ps %%ymm9, %%ymm7, %%ymm1 \n\t" - "vfmadd231ps %%ymm9, %%ymm8, %%ymm2 \n\t" - "vbroadcastss 0xC(%2), %%ymm9 \n\t" - "vfmadd231ps %%ymm9, %%ymm6, %%ymm3 \n\t" - "vfmadd231ps %%ymm9, %%ymm7, %%ymm4 \n\t" - "vfmadd231ps %%ymm9, %%ymm8, %%ymm5 \n\t" - - "prefetcht0 0x200(%1) \n\t" - "prefetcht0 0x240(%1) \n\t" - - "vmovaps 0xC0(%1), %%ymm6 \n\t" - "vmovaps 0xE0(%1), %%ymm7 \n\t" - "vmovaps 0x100(%1), %%ymm8 \n\t" - "vbroadcastss 0x10(%2), %%ymm9 \n\t" - "vfmadd231ps %%ymm9, %%ymm6, %%ymm0 \n\t" - "vfmadd231ps %%ymm9, %%ymm7, %%ymm1 \n\t" - "vfmadd231ps %%ymm9, %%ymm8, %%ymm2 \n\t" - "vbroadcastss 0x14(%2), %%ymm9 \n\t" - "vfmadd231ps %%ymm9, %%ymm6, %%ymm3 \n\t" - "vfmadd231ps %%ymm9, %%ymm7, %%ymm4 \n\t" - "vfmadd231ps %%ymm9, %%ymm8, %%ymm5 \n\t" - - "prefetcht0 0x280(%1) \n\t" - - "vmovaps 0x120(%1), %%ymm6 \n\t" - "vmovaps 0x140(%1), %%ymm7 \n\t" - "vmovaps 0x160(%1), %%ymm8 \n\t" - "vbroadcastss 0x18(%2), %%ymm9 \n\t" - "vfmadd231ps %%ymm9, %%ymm6, %%ymm0 \n\t" - "vfmadd231ps %%ymm9, %%ymm7, %%ymm1 \n\t" - "vfmadd231ps %%ymm9, %%ymm8, %%ymm2 \n\t" - "vbroadcastss 0x1C(%2), %%ymm9 \n\t" - "vfmadd231ps %%ymm9, %%ymm6, %%ymm3 \n\t" - "vfmadd231ps %%ymm9, %%ymm7, %%ymm4 \n\t" - "vfmadd231ps %%ymm9, %%ymm8, %%ymm5 \n\t" - - "add $0x180, %1 \n\t" - "add $0x20, %2 \n\t" - - "sub $1, %%ecx \n\t" - "jg .k_loop_2x24 \n\t" - ".align 16 \n\t" - ".k_loop_2x24_end: \n\t" - - "mov %0, %%ecx \n\t" - "and $3, %%ecx \n\t" - "je .k_loop_2x24_remain_end \n\t" - - ".align 16 \n\t" - ".k_loop_2x24_remain: \n\t" - "vmovaps (%1), %%ymm6 \n\t" - "vmovaps 0x20(%1), %%ymm7 \n\t" - "vmovaps 0x40(%1), %%ymm8 \n\t" - "vbroadcastss 0x0(%2), %%ymm9 \n\t" - "vfmadd231ps %%ymm9, %%ymm6, %%ymm0 \n\t" - "vfmadd231ps %%ymm9, %%ymm7, %%ymm1 \n\t" - "vfmadd231ps %%ymm9, %%ymm8, %%ymm2 \n\t" - "vbroadcastss 0x4(%2), %%ymm9 \n\t" - "vfmadd231ps %%ymm9, %%ymm6, %%ymm3 \n\t" - "vfmadd231ps %%ymm9, %%ymm7, %%ymm4 \n\t" - "vfmadd231ps %%ymm9, %%ymm8, %%ymm5 \n\t" - "add $0x60, %1 \n\t" - "add $0x8, %2 \n\t" - "sub $1, %%ecx \n\t" - "jg .k_loop_2x24_remain \n\t" - - ".align 16 \n\t" - ".k_loop_2x24_remain_end: \n\t" - "prefetcht0 (%3, %%rax) \n\t" - "prefetcht0 0x40(%3, %%rax) \n\t" - "vaddps (%3), %%ymm0, %%ymm0 \n\t" - "vaddps 0x20(%3), %%ymm1, %%ymm1 \n\t" - "vaddps 0x40(%3), %%ymm2, %%ymm2 \n\t" - "vmovups %%ymm0, (%3) \n\t" - "vmovups %%ymm1, 0x20(%3) \n\t" - "vmovups %%ymm2, 0x40(%3) \n\t" - "add %%rax, %3 \n\t" - "vaddps (%3), %%ymm3, %%ymm3 \n\t" - "vaddps 0x20(%3), %%ymm4, %%ymm4 \n\t" - "vaddps 0x40(%3), %%ymm5, %%ymm5 \n\t" - "vmovups %%ymm3, (%3) \n\t" - "vmovups %%ymm4, 0x20(%3) \n\t" - "vmovups %%ymm5, 0x40(%3) \n\t" - : - : "r"(bk), "r"(matrixB), "r"(matrixA), "r"(matrixC), "r"(N) - : "%eax", "%rax", "%ecx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", - "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "memory"); -} - -void mmm_avx2_2x16_asm(U32 um, U32 un, U32 bk, F32 *matrixA, F32 *matrixB, F32 *matrixC, U32 N) -{ - __asm__ __volatile__("mov %4, %%eax \n\t" - "shl $2, %%eax \n\t" - "mov %%eax, %%eax \n\t" - - "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" - "vxorps %%ymm1, 
%%ymm1, %%ymm1 \n\t" - "vxorps %%ymm3, %%ymm3, %%ymm3 \n\t" - "vxorps %%ymm4, %%ymm4, %%ymm4 \n\t" - - "mov %0, %%ecx \n\t" - "shr $2, %%ecx \n\t" - "je .k_loop_2x16_end \n\t" - - ".align 16 \n\t" - ".k_loop_2x16: \n\t" - - "prefetcht0 0x140(%1) \n\t" - - "vmovaps (%1), %%ymm6 \n\t" - "vmovaps 0x20(%1), %%ymm7 \n\t" - "vbroadcastss 0x0(%2), %%ymm9 \n\t" - "vfmadd231ps %%ymm9, %%ymm6, %%ymm0 \n\t" - "vfmadd231ps %%ymm9, %%ymm7, %%ymm1 \n\t" - "vbroadcastss 0x4(%2), %%ymm9 \n\t" - "vfmadd231ps %%ymm9, %%ymm6, %%ymm3 \n\t" - "vfmadd231ps %%ymm9, %%ymm7, %%ymm4 \n\t" - - "prefetcht0 0x180(%1) \n\t" - - "vmovaps 0x40(%1), %%ymm6 \n\t" - "vmovaps 0x60(%1), %%ymm7 \n\t" - "vbroadcastss 0x8(%2), %%ymm9 \n\t" - "vfmadd231ps %%ymm9, %%ymm6, %%ymm0 \n\t" - "vfmadd231ps %%ymm9, %%ymm7, %%ymm1 \n\t" - "vbroadcastss 0xC(%2), %%ymm9 \n\t" - "vfmadd231ps %%ymm9, %%ymm6, %%ymm3 \n\t" - "vfmadd231ps %%ymm9, %%ymm7, %%ymm4 \n\t" - - "prefetcht0 0x1C0(%1) \n\t" - - "vmovaps 0x80(%1), %%ymm6 \n\t" - "vmovaps 0xA0(%1), %%ymm7 \n\t" - "vbroadcastss 0x10(%2), %%ymm9 \n\t" - "vfmadd231ps %%ymm9, %%ymm6, %%ymm0 \n\t" - "vfmadd231ps %%ymm9, %%ymm7, %%ymm1 \n\t" - "vbroadcastss 0x14(%2), %%ymm9 \n\t" - "vfmadd231ps %%ymm9, %%ymm6, %%ymm3 \n\t" - "vfmadd231ps %%ymm9, %%ymm7, %%ymm4 \n\t" - - "prefetcht0 0x200(%1) \n\t" - - "vmovaps 0xC0(%1), %%ymm6 \n\t" - "vmovaps 0xE0(%1), %%ymm7 \n\t" - "vbroadcastss 0x18(%2), %%ymm9 \n\t" - "vfmadd231ps %%ymm9, %%ymm6, %%ymm0 \n\t" - "vfmadd231ps %%ymm9, %%ymm7, %%ymm1 \n\t" - "vbroadcastss 0x1C(%2), %%ymm9 \n\t" - "vfmadd231ps %%ymm9, %%ymm6, %%ymm3 \n\t" - "vfmadd231ps %%ymm9, %%ymm7, %%ymm4 \n\t" - - "add $0x100, %1 \n\t" - "add $0x20, %2 \n\t" - - "sub $1, %%ecx \n\t" - "jg .k_loop_2x16 \n\t" - ".align 16 \n\t" - ".k_loop_2x16_end: \n\t" - - "mov %0, %%ecx \n\t" - "and $3, %%ecx \n\t" - "je .k_loop_2x16_remain_end \n\t" - - ".align 16 \n\t" - ".k_loop_2x16_remain: \n\t" - "vmovaps (%1), %%ymm6 \n\t" - "vmovaps 0x20(%1), %%ymm7 \n\t" - "vbroadcastss 0x0(%2), %%ymm9 \n\t" - "vfmadd231ps %%ymm9, %%ymm6, %%ymm0 \n\t" - "vfmadd231ps %%ymm9, %%ymm7, %%ymm1 \n\t" - "vbroadcastss 0x4(%2), %%ymm9 \n\t" - "vfmadd231ps %%ymm9, %%ymm6, %%ymm3 \n\t" - "vfmadd231ps %%ymm9, %%ymm7, %%ymm4 \n\t" - "add $0x40, %1 \n\t" - "add $0x8, %2 \n\t" - "sub $1, %%ecx \n\t" - "jg .k_loop_2x16_remain \n\t" - - ".align 16 \n\t" - ".k_loop_2x16_remain_end: \n\t" - "prefetcht0 (%3, %%rax) \n\t" - "vaddps (%3), %%ymm0, %%ymm0 \n\t" - "vaddps 0x20(%3), %%ymm1, %%ymm1 \n\t" - "vmovups %%ymm0, (%3) \n\t" - "vmovups %%ymm1, 0x20(%3) \n\t" - "add %%rax, %3 \n\t" - "prefetcht0 (%3, %%rax) \n\t" - "vaddps (%3), %%ymm3, %%ymm3 \n\t" - "vaddps 0x20(%3), %%ymm4, %%ymm4 \n\t" - "vmovups %%ymm3, (%3) \n\t" - "vmovups %%ymm4, 0x20(%3) \n\t" - : - : "r"(bk), "r"(matrixB), "r"(matrixA), "r"(matrixC), "r"(N) - : "%eax", "%rax", "%ecx", "%ymm0", "%ymm1", "%ymm3", "%ymm4", "%ymm6", - "%ymm7", "%ymm9", "memory"); -} - -void mmm_avx2_2x8_asm(U32 um, U32 un, U32 bk, F32 *matrixA, F32 *matrixB, F32 *matrixC, U32 N) -{ - __asm__ __volatile__("mov %4, %%eax \n\t" - "shl $2, %%eax \n\t" - "mov %%eax, %%eax \n\t" - - "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" - "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" - - "mov %0, %%ecx \n\t" - "shr $2, %%ecx \n\t" - "je .k_loop_2x8_end \n\t" - - ".align 16 \n\t" - ".k_loop_2x8: \n\t" - - "prefetcht0 0x140(%1) \n\t" - "vmovaps (%1), %%ymm2 \n\t" - "vbroadcastss 0x0(%2), %%ymm3 \n\t" - "vfmadd231ps %%ymm3, %%ymm2, %%ymm0 \n\t" - "vbroadcastss 0x4(%2), %%ymm3 \n\t" - "vfmadd231ps %%ymm3, %%ymm2, %%ymm1 \n\t" - - 
"vmovaps 0x20(%1), %%ymm2 \n\t" - "vbroadcastss 0x8(%2), %%ymm3 \n\t" - "vfmadd231ps %%ymm3, %%ymm2, %%ymm0 \n\t" - "vbroadcastss 0xC(%2), %%ymm3 \n\t" - "vfmadd231ps %%ymm3, %%ymm2, %%ymm1 \n\t" - - "prefetcht0 0x180(%1) \n\t" - "vmovaps 0x40(%1), %%ymm2 \n\t" - "vbroadcastss 0x10(%2), %%ymm3 \n\t" - "vfmadd231ps %%ymm3, %%ymm2, %%ymm0 \n\t" - "vbroadcastss 0x14(%2), %%ymm3 \n\t" - "vfmadd231ps %%ymm3, %%ymm2, %%ymm1 \n\t" - - "vmovaps 0x60(%1), %%ymm2 \n\t" - "vbroadcastss 0x18(%2), %%ymm3 \n\t" - "vfmadd231ps %%ymm3, %%ymm2, %%ymm0 \n\t" - "vbroadcastss 0x1C(%2), %%ymm3 \n\t" - "vfmadd231ps %%ymm3, %%ymm2, %%ymm1 \n\t" - - "add $0x80, %1 \n\t" - "add $0x20, %2 \n\t" - - "sub $1, %%ecx \n\t" - "jg .k_loop_2x8 \n\t" - ".align 16 \n\t" - ".k_loop_2x8_end: \n\t" - - "mov %0, %%ecx \n\t" - "and $3, %%ecx \n\t" - "je .k_loop_2x8_remain_end \n\t" - - ".align 16 \n\t" - ".k_loop_2x8_remain: \n\t" - "vmovaps (%1), %%ymm2 \n\t" - "vbroadcastss 0x0(%2), %%ymm3 \n\t" - "vfmadd231ps %%ymm3, %%ymm2, %%ymm0 \n\t" - "vbroadcastss 0x4(%2), %%ymm3 \n\t" - "vfmadd231ps %%ymm3, %%ymm2, %%ymm1 \n\t" - "add $0x20, %1 \n\t" - "add $0x8, %2 \n\t" - "sub $1, %%ecx \n\t" - "jg .k_loop_2x8_remain \n\t" - - ".align 16 \n\t" - ".k_loop_2x8_remain_end: \n\t" - - "vaddps (%3), %%ymm0, %%ymm0 \n\t" - "vmovups %%ymm0, (%3) \n\t" - "add %%rax, %3 \n\t" - "vaddps (%3), %%ymm1, %%ymm1 \n\t" - "vmovups %%ymm1, (%3) \n\t" - : - : "r"(bk), "r"(matrixB), "r"(matrixA), "r"(matrixC), "r"(N) - : "%eax", "%rax", "%ecx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "memory"); -} - -void mmm_avx2_2x4_asm(U32 um, U32 un, U32 bk, F32 *matrixA, F32 *matrixB, F32 *matrixC, U32 N) -{ - __asm__ __volatile__("mov %4, %%eax \n\t" - "shl $2, %%eax \n\t" - "mov %%eax, %%eax \n\t" - - "vxorps %%xmm0, %%xmm0, %%xmm0 \n\t" - "vxorps %%xmm1, %%xmm1, %%xmm1 \n\t" - - "mov %0, %%ecx \n\t" - "shr $2, %%ecx \n\t" - "je .k_loop_2x4_end \n\t" - - ".align 16 \n\t" - ".k_loop_2x4: \n\t" - - "prefetcht0 0x140(%1) \n\t" - "vmovaps (%1), %%xmm2 \n\t" - "vbroadcastss 0x0(%2), %%xmm3 \n\t" - "vfmadd231ps %%xmm3, %%xmm2, %%xmm0 \n\t" - "vbroadcastss 0x4(%2), %%xmm3 \n\t" - "vfmadd231ps %%xmm3, %%xmm2, %%xmm1 \n\t" - - "vmovaps 0x10(%1), %%xmm2 \n\t" - "vbroadcastss 0x8(%2), %%xmm3 \n\t" - "vfmadd231ps %%xmm3, %%xmm2, %%xmm0 \n\t" - "vbroadcastss 0xC(%2), %%xmm3 \n\t" - "vfmadd231ps %%xmm3, %%xmm2, %%xmm1 \n\t" - - "vmovaps 0x20(%1), %%xmm2 \n\t" - "vbroadcastss 0x10(%2), %%xmm3 \n\t" - "vfmadd231ps %%xmm3, %%xmm2, %%xmm0 \n\t" - "vbroadcastss 0x14(%2), %%xmm3 \n\t" - "vfmadd231ps %%xmm3, %%xmm2, %%xmm1 \n\t" - - "vmovaps 0x30(%1), %%xmm2 \n\t" - "vbroadcastss 0x18(%2), %%xmm3 \n\t" - "vfmadd231ps %%xmm3, %%xmm2, %%xmm0 \n\t" - "vbroadcastss 0x1C(%2), %%xmm3 \n\t" - "vfmadd231ps %%xmm3, %%xmm2, %%xmm1 \n\t" - - "add $0x40, %1 \n\t" - "add $0x20, %2 \n\t" - - "sub $1, %%ecx \n\t" - "jg .k_loop_2x4 \n\t" - ".align 16 \n\t" - ".k_loop_2x4_end: \n\t" - - "mov %0, %%ecx \n\t" - "and $3, %%ecx \n\t" - "je .k_loop_2x4_remain_end \n\t" - - ".align 16 \n\t" - ".k_loop_2x4_remain: \n\t" - "vmovaps (%1), %%xmm2 \n\t" - "vbroadcastss 0x0(%2), %%xmm3 \n\t" - "vfmadd231ps %%xmm3, %%xmm2, %%xmm0 \n\t" - "vbroadcastss 0x4(%2), %%xmm3 \n\t" - "vfmadd231ps %%xmm3, %%xmm2, %%xmm1 \n\t" - "add $0x10, %1 \n\t" - "add $0x8, %2 \n\t" - "sub $1, %%ecx \n\t" - "jg .k_loop_2x4_remain \n\t" - - ".align 16 \n\t" - ".k_loop_2x4_remain_end: \n\t" - - "vaddps (%3), %%xmm0, %%xmm0 \n\t" - "vmovups %%xmm0, (%3) \n\t" - "add %%rax, %3 \n\t" - "vaddps (%3), %%xmm1, %%xmm1 \n\t" - "vmovups %%xmm1, (%3) 
\n\t" - : - : "r"(bk), "r"(matrixB), "r"(matrixA), "r"(matrixC), "r"(N) - : "%eax", "%rax", "%ecx", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "memory"); -} - -void mmm_avx2_1x24_asm(U32 um, U32 un, U32 bk, F32 *matrixA, F32 *matrixB, F32 *matrixC, U32 N) -{ - __asm__ __volatile__("mov %4, %%eax \n\t" - "shl $2, %%eax \n\t" - "mov %%eax, %%eax \n\t" - - "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" - "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" - "vxorps %%ymm2, %%ymm2, %%ymm2 \n\t" - - "mov %0, %%ecx \n\t" - "shr $2, %%ecx \n\t" - "je .k_loop_1x24_end \n\t" - - ".align 16 \n\t" - ".k_loop_1x24: \n\t" - - "prefetcht0 0x140(%1) \n\t" - "prefetcht0 0x180(%1) \n\t" - - "vmovaps (%1), %%ymm3 \n\t" - "vmovaps 0x20(%1), %%ymm4 \n\t" - "vmovaps 0x40(%1), %%ymm5 \n\t" - "vbroadcastss 0x0(%2), %%ymm6 \n\t" - "vfmadd231ps %%ymm6, %%ymm3, %%ymm0 \n\t" - "vfmadd231ps %%ymm6, %%ymm4, %%ymm1 \n\t" - "vfmadd231ps %%ymm6, %%ymm5, %%ymm2 \n\t" - - "prefetcht0 0x1C0(%1) \n\t" - - "vmovaps 0x60(%1), %%ymm3 \n\t" - "vmovaps 0x80(%1), %%ymm4 \n\t" - "vmovaps 0xA0(%1), %%ymm5 \n\t" - "vbroadcastss 0x4(%2), %%ymm6 \n\t" - "vfmadd231ps %%ymm6, %%ymm3, %%ymm0 \n\t" - "vfmadd231ps %%ymm6, %%ymm4, %%ymm1 \n\t" - "vfmadd231ps %%ymm6, %%ymm5, %%ymm2 \n\t" - - "prefetcht0 0x200(%1) \n\t" - "prefetcht0 0x240(%1) \n\t" - - "vmovaps 0xC0(%1), %%ymm3 \n\t" - "vmovaps 0xE0(%1), %%ymm4 \n\t" - "vmovaps 0x100(%1), %%ymm5 \n\t" - "vbroadcastss 0x8(%2), %%ymm6 \n\t" - "vfmadd231ps %%ymm6, %%ymm3, %%ymm0 \n\t" - "vfmadd231ps %%ymm6, %%ymm4, %%ymm1 \n\t" - "vfmadd231ps %%ymm6, %%ymm5, %%ymm2 \n\t" - - "prefetcht0 0x280(%1) \n\t" - - "vmovaps 0x120(%1), %%ymm3 \n\t" - "vmovaps 0x140(%1), %%ymm4 \n\t" - "vmovaps 0x160(%1), %%ymm5 \n\t" - "vbroadcastss 0xC(%2), %%ymm6 \n\t" - "vfmadd231ps %%ymm6, %%ymm3, %%ymm0 \n\t" - "vfmadd231ps %%ymm6, %%ymm4, %%ymm1 \n\t" - "vfmadd231ps %%ymm6, %%ymm5, %%ymm2 \n\t" - - "add $0x180, %1 \n\t" - "add $0x10, %2 \n\t" - - "sub $1, %%ecx \n\t" - "jg .k_loop_1x24 \n\t" - ".align 16 \n\t" - ".k_loop_1x24_end: \n\t" - - "mov %0, %%ecx \n\t" - "and $3, %%ecx \n\t" - "je .k_loop_1x24_remain_end \n\t" - - ".align 16 \n\t" - ".k_loop_1x24_remain: \n\t" - "vmovaps (%1), %%ymm3 \n\t" - "vmovaps 0x20(%1), %%ymm4 \n\t" - "vmovaps 0x40(%1), %%ymm5 \n\t" - "vbroadcastss (%2), %%ymm6 \n\t" - "vfmadd231ps %%ymm6, %%ymm3, %%ymm0 \n\t" - "vfmadd231ps %%ymm6, %%ymm4, %%ymm1 \n\t" - "vfmadd231ps %%ymm6, %%ymm5, %%ymm2 \n\t" - "add $0x60, %1 \n\t" - "add $0x4, %2 \n\t" - "sub $1, %%ecx \n\t" - "jg .k_loop_1x24_remain \n\t" - - ".align 16 \n\t" - ".k_loop_1x24_remain_end: \n\t" - "prefetcht0 (%3, %%rax) \n\t" - "prefetcht0 0x40(%3, %%rax) \n\t" - "vaddps (%3), %%ymm0, %%ymm0 \n\t" - "vaddps 0x20(%3), %%ymm1, %%ymm1 \n\t" - "vaddps 0x40(%3), %%ymm2, %%ymm2 \n\t" - "vmovups %%ymm0, (%3) \n\t" - "vmovups %%ymm1, 0x20(%3) \n\t" - "vmovups %%ymm2, 0x40(%3) \n\t" - : - : "r"(bk), "r"(matrixB), "r"(matrixA), "r"(matrixC), "r"(N) - : "%eax", "%rax", "%ecx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", - "%ymm5", "%ymm6", "memory"); -} - -void mmm_avx2_1x16_asm(U32 um, U32 un, U32 bk, F32 *matrixA, F32 *matrixB, F32 *matrixC, U32 N) -{ - __asm__ __volatile__( - "mov %4, %%eax \n\t" - "shl $2, %%eax \n\t" - "mov %%eax, %%eax \n\t" - - "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" - "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" - - "mov %0, %%ecx \n\t" - "shr $2, %%ecx \n\t" - "je .k_loop_1x16_end \n\t" - - ".align 16 \n\t" - ".k_loop_1x16: \n\t" - - "prefetcht0 0x140(%1) \n\t" - - "vmovaps (%1), %%ymm2 \n\t" - "vmovaps 0x20(%1), %%ymm3 \n\t" - "vbroadcastss (%2), %%ymm5 
\n\t" - "vfmadd231ps %%ymm5, %%ymm2, %%ymm0 \n\t" - "vfmadd231ps %%ymm5, %%ymm3, %%ymm1 \n\t" - - "prefetcht0 0x180(%1) \n\t" - - "vmovaps 0x40(%1), %%ymm2 \n\t" - "vmovaps 0x60(%1), %%ymm3 \n\t" - "vbroadcastss 0x4(%2), %%ymm5 \n\t" - "vfmadd231ps %%ymm5, %%ymm2, %%ymm0 \n\t" - "vfmadd231ps %%ymm5, %%ymm3, %%ymm1 \n\t" - - "prefetcht0 0x1C0(%1) \n\t" - - "vmovaps 0x80(%1), %%ymm2 \n\t" - "vmovaps 0xA0(%1), %%ymm3 \n\t" - "vbroadcastss 0x8(%2), %%ymm5 \n\t" - "vfmadd231ps %%ymm5, %%ymm2, %%ymm0 \n\t" - "vfmadd231ps %%ymm5, %%ymm3, %%ymm1 \n\t" - - "prefetcht0 0x200(%1) \n\t" - - "vmovaps 0xC0(%1), %%ymm2 \n\t" - "vmovaps 0xE0(%1), %%ymm3 \n\t" - "vbroadcastss 0xC(%2), %%ymm5 \n\t" - "vfmadd231ps %%ymm5, %%ymm2, %%ymm0 \n\t" - "vfmadd231ps %%ymm5, %%ymm3, %%ymm1 \n\t" - - "add $0x100, %1 \n\t" - "add $0x10, %2 \n\t" - - "sub $1, %%ecx \n\t" - "jg .k_loop_1x16 \n\t" - ".align 16 \n\t" - ".k_loop_1x16_end: \n\t" - - "mov %0, %%ecx \n\t" - "and $3, %%ecx \n\t" - "je .k_loop_1x16_remain_end \n\t" - - ".align 16 \n\t" - ".k_loop_1x16_remain: \n\t" - "vmovaps (%1), %%ymm2 \n\t" - "vmovaps 0x20(%1), %%ymm3 \n\t" - "vbroadcastss 0x0(%2), %%ymm5 \n\t" - "vfmadd231ps %%ymm5, %%ymm2, %%ymm0 \n\t" - "vfmadd231ps %%ymm5, %%ymm3, %%ymm1 \n\t" - "add $0x40, %1 \n\t" - "add $0x4, %2 \n\t" - "sub $1, %%ecx \n\t" - "jg .k_loop_1x16_remain \n\t" - - ".align 16 \n\t" - ".k_loop_1x16_remain_end: \n\t" - "prefetcht0 (%3, %%rax) \n\t" - "vaddps (%3), %%ymm0, %%ymm0 \n\t" - "vaddps 0x20(%3), %%ymm1, %%ymm1 \n\t" - "vmovups %%ymm0, (%3) \n\t" - "vmovups %%ymm1, 0x20(%3) \n\t" - : - : "r"(bk), "r"(matrixB), "r"(matrixA), "r"(matrixC), "r"(N) - : "%eax", "%rax", "%ecx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm5", "memory"); -} - -void mmm_avx2_1x8_asm(U32 um, U32 un, U32 bk, F32 *matrixA, F32 *matrixB, F32 *matrixC, U32 N) -{ - __asm__ __volatile__("mov %4, %%eax \n\t" - "shl $2, %%eax \n\t" - "mov %%eax, %%eax \n\t" - - "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" - - "mov %0, %%ecx \n\t" - "shr $2, %%ecx \n\t" - "je .k_loop_1x8_end \n\t" - - ".align 16 \n\t" - ".k_loop_1x8: \n\t" - - "prefetcht0 0x140(%1) \n\t" - "vmovaps (%1), %%ymm1 \n\t" - "vbroadcastss (%2), %%ymm2 \n\t" - "vfmadd231ps %%ymm2, %%ymm1, %%ymm0 \n\t" - - "vmovaps 0x20(%1), %%ymm1 \n\t" - "vbroadcastss 0x4(%2), %%ymm2 \n\t" - "vfmadd231ps %%ymm2, %%ymm1, %%ymm0 \n\t" - - "prefetcht0 0x180(%1) \n\t" - "vmovaps 0x40(%1), %%ymm1 \n\t" - "vbroadcastss 0x8(%2), %%ymm2 \n\t" - "vfmadd231ps %%ymm2, %%ymm1, %%ymm0 \n\t" - - "vmovaps 0x60(%1), %%ymm1 \n\t" - "vbroadcastss 0xC(%2), %%ymm2 \n\t" - "vfmadd231ps %%ymm2, %%ymm1, %%ymm0 \n\t" - - "add $0x80, %1 \n\t" - "add $0x10, %2 \n\t" - - "sub $1, %%ecx \n\t" - "jg .k_loop_1x8 \n\t" - ".align 16 \n\t" - ".k_loop_1x8_end: \n\t" - - "mov %0, %%ecx \n\t" - "and $3, %%ecx \n\t" - "je .k_loop_1x8_remain_end \n\t" - - ".align 16 \n\t" - ".k_loop_1x8_remain: \n\t" - "vmovaps (%1), %%ymm1 \n\t" - "vbroadcastss (%2), %%ymm2 \n\t" - "vfmadd231ps %%ymm2, %%ymm1, %%ymm0 \n\t" - "add $0x20, %1 \n\t" - "add $0x4, %2 \n\t" - "sub $1, %%ecx \n\t" - "jg .k_loop_1x8_remain \n\t" - - ".align 16 \n\t" - ".k_loop_1x8_remain_end: \n\t" - - "vaddps (%3), %%ymm0, %%ymm0 \n\t" - "vmovups %%ymm0, (%3) \n\t" - : - : "r"(bk), "r"(matrixB), "r"(matrixA), "r"(matrixC), "r"(N) - : "%eax", "%rax", "%ecx", "%ymm0", "%ymm1", "%ymm2", "memory"); -} - -void mmm_avx2_1x4_asm(U32 um, U32 un, U32 bk, F32 *matrixA, F32 *matrixB, F32 *matrixC, U32 N) -{ - __asm__ __volatile__("mov %4, %%eax \n\t" - "shl $2, %%eax \n\t" - "mov %%eax, %%eax \n\t" - - 
"vxorps %%xmm0, %%xmm0, %%xmm0 \n\t" - - "mov %0, %%ecx \n\t" - "shr $2, %%ecx \n\t" - "je .k_loop_1x4_end \n\t" - ".align 16 \n\t" - ".k_loop_1x4: \n\t" - - "prefetcht0 0x40(%1) \n\t" - - "vmovaps (%1), %%xmm1 \n\t" - "vbroadcastss 0x0(%2), %%xmm2 \n\t" - "vfmadd231ps %%xmm2, %%xmm1, %%xmm0 \n\t" - - "vmovaps 0x10(%1), %%xmm1 \n\t" - "vbroadcastss 0x4(%2), %%xmm2 \n\t" - "vfmadd231ps %%xmm2, %%xmm1, %%xmm0 \n\t" - - "vmovaps 0x20(%1), %%xmm1 \n\t" - "vbroadcastss 0x8(%2), %%xmm2 \n\t" - "vfmadd231ps %%xmm2, %%xmm1, %%xmm0 \n\t" - - "vmovaps 0x30(%1), %%xmm1 \n\t" - "vbroadcastss 0xC(%2), %%xmm2 \n\t" - "vfmadd231ps %%xmm2, %%xmm1, %%xmm0 \n\t" - - "add $0x40, %1 \n\t" - "add $0x10, %2 \n\t" - - "sub $1, %%ecx \n\t" - "jg .k_loop_1x4 \n\t" - ".align 16 \n\t" - ".k_loop_1x4_end: \n\t" - - "mov %0, %%ecx \n\t" - "and $3, %%ecx \n\t" - "je .k_loop_1x4_remain_end \n\t" - - ".align 16 \n\t" - ".k_loop_1x4_remain: \n\t" - "vmovaps (%1), %%xmm1 \n\t" - "vbroadcastss 0x0(%2), %%xmm2 \n\t" - "vfmadd231ps %%xmm2, %%xmm1, %%xmm0 \n\t" - "add $0x10, %1 \n\t" - "add $0x4, %2 \n\t" - "sub $1, %%ecx \n\t" - "jg .k_loop_1x4_remain \n\t" - - ".align 16 \n\t" - ".k_loop_1x4_remain_end: \n\t" - - "vaddps (%3), %%xmm0, %%xmm0 \n\t" - "vmovups %%xmm0, (%3) \n\t" - "add %%rax, %3 \n\t" - : - : "r"(bk), "r"(matrixB), "r"(matrixA), "r"(matrixC), "r"(N) - : "%eax", "%rax", "%ecx", "%xmm0", "%xmm1", "%xmm2", "memory"); -} - -void mmm_avx2_n_mtail(U32 um, U32 un, U32 bk, F32 *matrixA, F32 *matrixB, F32 *matrixC, U32 N) -{ - for (U32 i = 0; i < um; ++i) { - for (U32 j = 0; j < un; ++j) { - for (U32 k = 0; k < bk; ++k) { - matrixC[i * N + j] += matrixA[k * um + i] * matrixB[k * un + j]; - } - } - } -} - EE mmm_avx2_fp32( int N, int M, int K, DataFormat matrix1Df, F32 *matrix1, F32 *matrix2, F32 *tmp, F32 *result) { // buffer addr algined to 32 F32 *packA = (F32 *)align_addr(tmp, 32); F32 *packB = (F32 *)align_addr(matrix2, 32); - kernel_func kernel[3][5] = { + kernel_func kernel[4][5] = { {mmm_avx2_n_mtail, mmm_avx2_1x4_asm, mmm_avx2_1x8_asm, mmm_avx2_1x16_asm, mmm_avx2_1x24_asm}, {mmm_avx2_n_mtail, mmm_avx2_2x4_asm, mmm_avx2_2x8_asm, mmm_avx2_2x16_asm, mmm_avx2_2x24_asm}, + {mmm_avx2_n_mtail, mmm_avx2_3x4_asm, mmm_avx2_3x8_asm, mmm_avx2_3x16_asm, mmm_avx2_3x24_asm}, {mmm_avx2_n_mtail, mmm_avx2_4x4_asm, mmm_avx2_4x8_asm, mmm_avx2_4x16_asm, mmm_avx2_4x24_asm}}; F32 unrollNSize[4] = {4, 8, 16, 24}; - F32 unrollMSize[3] = {1, 2, 4}; - I32 resN = N % 24; - I32 blockNNum = N / 24; - I32 edgeblockNSizeArray[5] = {0}; - for (U32 i = 0; resN > 0; ++i) { - U32 value = UNI_MIN(unrollNSize[resN >> 3], resN); - edgeblockNSizeArray[i] += value; - edgeblockNSizeArray[i + 1] = edgeblockNSizeArray[i]; - resN -= value; - blockNNum += 1; + F32 unrollMSize[4] = {1, 2, 3, 4}; + I32 resN = N % UNROLL_N; + I32 blockNNum = N / UNROLL_N + (resN > 0); + I32 edgeBlockNSizeIdx = (resN > 4) ? ((resN + 7) / 8) : 0; + I32 edgeBlockNSize = (resN > 0) ? unrollNSize[edgeBlockNSizeIdx] : 0; + I32 mask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; + if (resN != edgeBlockNSize) { + UNI_MEMSET(mask + resN % 8, 0, (edgeBlockNSize - resN) * 4); + } + I32 *maskPtr = (N % 4 != 0) ? 
mask : nullptr; + I32 alginedN = (blockNNum - 1) * UNROLL_N + edgeBlockNSize; + if (edgeBlockNSize == 0) { + alginedN += UNROLL_N; } + I32 blockNum = (M + 3) / 4 * blockNNum; + I32 mainBlockNum = (BOLCK_M_DIM + 3) / 4 * blockNNum; #ifdef _USE_OPENMP -#pragma omp parallel num_threads(OMP_NUM_THREADS) + int in_parallel = omp_in_parallel(); +#pragma omp parallel num_threads(OMP_NUM_THREADS) if (in_parallel == 0) { #endif - I32 blockSizeM = 0, blockSizeK = 0; + I32 blockSizeK = 0; for (int k = 0; k < K; k += blockSizeK) { blockSizeK = UNI_MIN(BOLCK_K_DIM, K - k); - for (int j = 0; j < M; j += blockSizeM) { - blockSizeM = UNI_MIN(BOLCK_M_DIM, M - j); - I32 blockMNum = blockSizeM / 4 + (blockSizeM % 4 + 1) / 2; -#ifdef _USE_OPENMP -#pragma omp for -#endif - for (I32 mIdx = 0; mIdx < blockMNum; ++mIdx) { - I32 m = mIdx * 4 - ((mIdx * 4) > blockSizeM) * 2; - I32 unrollSizeM = UNI_MIN(UNROLL_M, blockSizeM - m); - unrollSizeM = unrollMSize[unrollSizeM >> 1]; - - I32 blockSizeN = UNI_MIN(UNROLL_N, N); - blockSizeN = UNI_MIN(unrollNSize[blockSizeN >> 3], blockSizeN); - - F32 *curB = packB + k * N; - F32 *curA = packA + m * blockSizeK; - if (matrix1Df == DF_TRANSPOSE) { - matrix2_trans(unrollSizeM, blockSizeK, M, matrix1 + (j + m) + k * M, curA); - } else if (matrix1Df == DF_NORMAL) { - matrix1_trans(unrollSizeM, blockSizeK, K, matrix1 + k + (j + m) * K, curA); - } else if (matrix1Df == DF_NKN8) { - matrix2_trans_c8( - unrollSizeM, blockSizeK, M, matrix1 + (j + m) * 8 + k * M, curA); - } - kernel[unrollSizeM >> 1][(blockSizeN >> 3) + (blockSizeN > 3)]( - unrollSizeM, blockSizeN, blockSizeK, curA, curB, result + (m + j) * N, N); - } + if (matrix1Df == DF_TRANSPOSE) { + matrix1_trans(blockSizeK, M, M, matrix1 + k * M, packA); + } + #ifdef _USE_OPENMP #pragma omp for #endif - for (int mnIdx = blockMNum; mnIdx < blockNNum * blockMNum; ++mnIdx) { - I32 nIdx = mnIdx / blockMNum; - I32 n = nIdx * UNROLL_N; - if (n >= N) { - U32 idx = (n - N) / UNROLL_N; - CHECK_REQUIREMENT(idx <= 4); - n = N / UNROLL_N * UNROLL_N + edgeblockNSizeArray[idx]; - } - I32 blockSizeN = UNI_MIN(UNROLL_N, N - n); - blockSizeN = UNI_MIN(unrollNSize[blockSizeN >> 3], blockSizeN); - F32 *curB = packB + k * N + n * blockSizeK; - - I32 mIdx = mnIdx % blockMNum; - I32 m = mIdx * 4 - ((mIdx * 4) > blockSizeM) * 2; - I32 unrollSizeM = UNI_MIN(UNROLL_M, blockSizeM - m); - unrollSizeM = unrollMSize[unrollSizeM >> 1]; - kernel[unrollSizeM >> 1][(blockSizeN >> 3) + (blockSizeN > 3)](unrollSizeM, - blockSizeN, blockSizeK, packA + m * blockSizeK, curB, - result + (m + j) * N + n, N); + for (int mnIdx = 0; mnIdx < blockNum; ++mnIdx) { + I32 j = mnIdx / mainBlockNum * BOLCK_M_DIM; + I32 blockSizeM = UNI_MIN(BOLCK_M_DIM, M - j); + I32 blockMNum = (blockSizeM + 3) / 4; + + I32 n = (mnIdx % mainBlockNum) / blockMNum * UNROLL_N; + I32 blockSizeN = UNI_MAX(UNI_MIN(UNROLL_N, N - n), edgeBlockNSize); + F32 *curB = packB + k * alginedN + n * blockSizeK; + maskPtr = ((blockSizeN + n) > N) ? 
mask : nullptr; + + I32 m = ((mnIdx % mainBlockNum) % blockMNum) * UNROLL_M; + I32 unrollSizeM = UNI_MIN(UNROLL_M, blockSizeM - m); + + F32 *curA, *A1, *A2, *A3; + if (matrix1Df == DF_TRANSPOSE) { + curA = packA + m * blockSizeK; + A1 = curA + blockSizeK; + A2 = curA + 2 * blockSizeK; + A3 = curA + 3 * blockSizeK; + } else { + curA = matrix1 + k + (j + m) * K; + A1 = curA + K; + A2 = curA + 2 * K; + A3 = curA + 3 * K; } + + kernel[unrollSizeM - 1][(blockSizeN >> 3) + (blockSizeN > 3)](unrollSizeM, + blockSizeN, blockSizeK, curA, curB, result + (m + j) * N + n, N, maskPtr, + A1, A2, A3); } } #ifdef _USE_OPENMP diff --git a/compute/blas_enhance/src/cpu/x86/fp32/mvm_avx2_pack.cpp b/compute/blas_enhance/src/cpu/x86/fp32/mvm_avx2_pack.cpp index ab67bff0..9334496f 100644 --- a/compute/blas_enhance/src/cpu/x86/fp32/mvm_avx2_pack.cpp +++ b/compute/blas_enhance/src/cpu/x86/fp32/mvm_avx2_pack.cpp @@ -61,7 +61,7 @@ EE matrix_vector_multiply_transform_weight_fp32(TensorDesc desc, F32 *src, F32 * unrollSizeN = unrollSize[unrollSizeN / 16 - (unrollSizeN >= 48)]; if (N - un < unrollSizeN) { for (U32 k = 0; k < blockKSize; ++k) { - memcpy(packB + k * (N - un), src + (k + bk) * N + un, + UNI_MEMCPY(packB + k * (N - un), src + (k + bk) * N + un, (N - un) * sizeof(F32)); } packB += (N - un) * blockKSize; @@ -265,7 +265,8 @@ void mvm_pack_fp32(U32 numRows, U32 numColumns, F32 *packB, F32 *vector, F32 *re blockNum += 1; } #ifdef _USE_OPENMP -#pragma omp parallel num_threads(OMP_NUM_THREADS) + int in_parallel = omp_in_parallel(); +#pragma omp parallel num_threads(OMP_NUM_THREADS) if (in_parallel == 0) { #endif U32 private_blockKSize = 0; diff --git a/compute/blas_enhance/src/cpu/x86/fp32/mvm_avx2_row.cpp b/compute/blas_enhance/src/cpu/x86/fp32/mvm_avx2_row.cpp index c8fd8afd..77aeb1b8 100644 --- a/compute/blas_enhance/src/cpu/x86/fp32/mvm_avx2_row.cpp +++ b/compute/blas_enhance/src/cpu/x86/fp32/mvm_avx2_row.cpp @@ -238,13 +238,13 @@ void mvm_row_avx_4_32(U32 bk, U32 lda, F32 *matrix, F32 *vector, F32 *result) ".align 16 \n\t" ".k_loop_remain_1_end: \n\t" - "vaddps (%3), %%xmm0, %%xmm0 \n\t" + "addss (%3), %%xmm0 \n\t" "vmovss %%xmm0, (%3) \n\t" - "vaddps 0x4(%3), %%xmm1, %%xmm1 \n\t" + "addss 0x4(%3), %%xmm1 \n\t" "vmovss %%xmm1, 0x4(%3) \n\t" - "vaddps 0x8(%3), %%xmm2, %%xmm2 \n\t" + "addss 0x8(%3), %%xmm2 \n\t" "vmovss %%xmm2, 0x8(%3) \n\t" - "vaddps 0xC(%3), %%xmm3, %%xmm3 \n\t" + "addss 0xC(%3), %%xmm3 \n\t" "vmovss %%xmm3, 0xC(%3) \n\t" : : "r"(bk), "r"(matrix), "r"(vector), "r"(result), "r"(lda) @@ -398,9 +398,9 @@ void mvm_row_avx_2_32(U32 bk, U32 lda, F32 *matrix, F32 *vector, F32 *result) ".align 16 \n\t" ".n2_k_loop_remain_1_end: \n\t" - "vaddps (%3), %%xmm0, %%xmm0 \n\t" + "addss (%3), %%xmm0 \n\t" "vmovss %%xmm0, (%3) \n\t" - "vaddps 0x4(%3), %%xmm1, %%xmm1 \n\t" + "addss 0x4(%3), %%xmm1 \n\t" "vmovss %%xmm1, 0x4(%3) \n\t" : : "r"(bk), "r"(matrix), "r"(vector), "r"(result), "r"(lda) @@ -513,7 +513,7 @@ void mvm_row_avx_1_32(U32 bk, U32 lda, F32 *matrix, F32 *vector, F32 *result) ".align 16 \n\t" ".n1_k_loop_remain_1_end: \n\t" - "vaddps (%3), %%xmm0, %%xmm0 \n\t" + "addss (%3), %%xmm0 \n\t" "vmovss %%xmm0, (%3) \n\t" : : "r"(bk), "r"(matrix), "r"(vector), "r"(result), "r"(lda) @@ -528,7 +528,8 @@ void mvm_row_fp32(U32 numRows, U32 numColumns, F32 *matrix, F32 *vector, F32 *re U32 unrollNSize[3] = {1, 2, 4}; U32 blockNum = numRows / 4 + (numRows % 4 + 1) / 2; #ifdef _USE_OPENMP -#pragma omp parallel num_threads(OMP_NUM_THREADS) + int in_parallel = omp_in_parallel(); +#pragma omp parallel 
num_threads(OMP_NUM_THREADS) if (in_parallel == 0) { #endif U32 private_blockKSize = 0; diff --git a/compute/blas_enhance/src/cpu/x86/int8/blas_int8.h b/compute/blas_enhance/src/cpu/x86/int8/blas_int8.h index 603b4d61..2f761711 100644 --- a/compute/blas_enhance/src/cpu/x86/int8/blas_int8.h +++ b/compute/blas_enhance/src/cpu/x86/int8/blas_int8.h @@ -21,15 +21,15 @@ #include "uni.h" #define SIMDW 8 -#define align_size(size, unit) ((size + unit - 1) / unit * unit) void matrix_matrix_multiply_tmp_bytes_int8( - U32 row1, U32 col1, U32 row2, U32 col2, DataType dt, U32 *bytes); + U32 row1, U32 col1, U32 row2, U32 col2, DataFormat df, DataType dt, U32 *bytes); // transform no-transposed B to K4, offline -inline void matrix1_trans_l(int size, int blockK, int K, int alignSize, INT8 *src, INT8 *dst) +inline void matrix1_trans_l( + int size, int alignedN, int blockK, int K, int alignSize, INT8 *src, INT8 *dst) { - int alignedBlockK = align_size(blockK, alignSize); + int alignedBlockK = UNI_ALIGN(blockK, alignSize); int blockKF32 = blockK / 4; __m256i vindex = _mm256_set_epi32(K * 7, K * 6, K * 5, K * 4, K * 3, K * 2, K, 0); int i; @@ -52,13 +52,18 @@ inline void matrix1_trans_l(int size, int blockK, int K, int alignSize, INT8 *sr } j *= 8; for (; j < size; ++j) { - memcpy(dst, src + i * 4 + j * K, 4); + UNI_MEMCPY(dst, src + i * 4 + j * K, 4); dst += 4; } + if (j < alignedN) { + UNI_MEMSET(dst, 0, 4 * (alignedN - size)); + dst += 4 * (alignedN - size); + } } i *= 4; for (; i < alignedBlockK; i += 4) { - for (int j = 0; j < size; ++j) { + int j = 0; + for (; j < size; ++j) { for (int ii = i; ii < i + 4; ++ii) { if (ii < blockK) { *(dst++) = src[ii + j * K]; @@ -67,15 +72,21 @@ inline void matrix1_trans_l(int size, int blockK, int K, int alignSize, INT8 *sr } } } + if (j < alignedN) { + UNI_MEMSET(dst, 0, 4 * (alignedN - size)); + dst += 4 * (alignedN - size); + } } } // transform transposed B to K4, offline -inline void matrix2_trans_l(int size, int blockK, int N, int alignSize, INT8 *src, INT8 *dst) +inline void matrix2_trans_l( + int size, int alignedN, int blockK, int N, int alignSize, INT8 *src, INT8 *dst) { - int alignedBlockK = align_size(blockK, alignSize); + int alignedBlockK = UNI_ALIGN(blockK, alignSize); for (int i = 0; i < alignedBlockK; i += 4) { - for (int j = 0; j < size; ++j) { + int j = 0; + for (; j < size; ++j) { for (int ii = i; ii < (i + 4); ++ii) { if (ii < blockK) { *(dst++) = src[ii * N + j]; @@ -84,6 +95,10 @@ inline void matrix2_trans_l(int size, int blockK, int N, int alignSize, INT8 *sr } } } + if (j < alignedN) { + UNI_MEMSET(dst, 0, 4 * (alignedN - size)); + dst += 4 * (alignedN - size); + } } } @@ -91,7 +106,7 @@ inline void matrix2_trans_l(int size, int blockK, int N, int alignSize, INT8 *sr inline void matrix2_trans_r(int size, int blockK, int M, int alignSize, UINT8 *src, UINT8 *dst) { // TODO: optimize - int alignedBlockK = align_size(blockK, alignSize); + int alignedBlockK = UNI_ALIGN(blockK, alignSize); for (int j = 0; j < size; ++j) { int i = 0; for (i = 0; i < blockK; ++i) { @@ -101,7 +116,7 @@ inline void matrix2_trans_r(int size, int blockK, int M, int alignSize, UINT8 *s *(dst++) = *(src + i * M + j); } for (; i < alignedBlockK; ++i) { - *(dst++) = 0; + *(dst++) = 128; } } } @@ -109,23 +124,21 @@ inline void matrix2_trans_r(int size, int blockK, int M, int alignSize, UINT8 *s // transpose A, online inline void matrix1_trans_r(int size, int blockK, int K, int alignSize, UINT8 *src, UINT8 *dst) { - int alignedBlockK = align_size(blockK, alignSize); + int 
alignedBlockK = UNI_ALIGN(blockK, alignSize); if (alignedBlockK != blockK) { - memset(dst, 0, alignedBlockK * size); + UNI_MEMSET(dst, 0, alignedBlockK * size); } for (int j = 0; j < size; ++j) { - memcpy(dst + j * alignedBlockK, src + j * K, blockK); + UNI_MEMCPY(dst + j * alignedBlockK, src + j * K, blockK); } } EE matrix_vector_multiply_transform_weight_int8( TensorDesc desc, INT8 *src, INT8 *packB, I32 *offsetCBias); -EE matrix_matrix_multiply_transform_rhsN_int8( - TensorDesc desc, INT8 *src, INT8 *dst, I32 *offsetCBias); +EE matrix_matrix_multiply_transform_rhsN_int8(TensorDesc desc, INT8 *src, INT8 *dst); -EE matrix_matrix_multiply_transform_rhsT_int8( - TensorDesc desc, INT8 *src, INT8 *dst, I32 *offsetCBias); +EE matrix_matrix_multiply_transform_rhsT_int8(TensorDesc desc, INT8 *src, INT8 *dst); EE mmm_avx512_vnni_int8(U32 M, U32 N, diff --git a/compute/blas_enhance/src/cpu/x86/int8/mmm_avx512_vnni.cpp b/compute/blas_enhance/src/cpu/x86/int8/mmm_avx512_vnni.cpp index 6dc580a0..d2a4a168 100644 --- a/compute/blas_enhance/src/cpu/x86/int8/mmm_avx512_vnni.cpp +++ b/compute/blas_enhance/src/cpu/x86/int8/mmm_avx512_vnni.cpp @@ -30,22 +30,2014 @@ typedef void (*kernel_func)(U32 um, U32 N, U32 stepK, const F32 *scale, + U32 nmask, + UINT8 *resK, U32 flags); +// clang-format off +#define loadOffset_1_1(rtype) \ + "vmovups (%[offset]), "#rtype"0 \n\t" + +#define loadOffset_6_1(rtype) \ + loadOffset_1_1(rtype) \ + "vmovups "#rtype"0, "#rtype"1 \n\t" \ + "vmovups "#rtype"0, "#rtype"2 \n\t" \ + "vmovups "#rtype"0, "#rtype"3 \n\t" \ + "vmovups "#rtype"0, "#rtype"4 \n\t" \ + "vmovups "#rtype"0, "#rtype"5 \n\t" + +#define loadOffset_12_1(rtype) \ + loadOffset_6_1(rtype) \ + "vmovups "#rtype"0, "#rtype"6 \n\t" \ + "vmovups "#rtype"0, "#rtype"7 \n\t" \ + "vmovups "#rtype"0, "#rtype"8 \n\t" \ + "vmovups "#rtype"0, "#rtype"9 \n\t" \ + "vmovups "#rtype"0, "#rtype"10 \n\t" \ + "vmovups "#rtype"0, "#rtype"11 \n\t" + +#define loadOffset_24_1(rtype) \ + loadOffset_12_1(rtype) \ + "vmovups "#rtype"0, "#rtype"12 \n\t" \ + "vmovups "#rtype"0, "#rtype"13 \n\t" \ + "vmovups "#rtype"0, "#rtype"14 \n\t" \ + "vmovups "#rtype"0, "#rtype"15 \n\t" \ + "vmovups "#rtype"0, "#rtype"16 \n\t" \ + "vmovups "#rtype"0, "#rtype"17 \n\t" \ + "vmovups "#rtype"0, "#rtype"18 \n\t" \ + "vmovups "#rtype"0, "#rtype"19 \n\t" \ + "vmovups "#rtype"0, "#rtype"20 \n\t" \ + "vmovups "#rtype"0, "#rtype"21 \n\t" \ + "vmovups "#rtype"0, "#rtype"22 \n\t" \ + "vmovups "#rtype"0, "#rtype"23 \n\t" + +#define loadOffset_1_2 \ + loadOffset_1_1(%%zmm) \ + "vmovups 0x40(%[offset]), %%zmm1 \n\t" + +#define loadOffset_3_2 \ + loadOffset_1_2 \ + "vmovups %%zmm0, %%zmm2 \n\t" \ + "vmovups %%zmm1, %%zmm3 \n\t" \ + "vmovups %%zmm0, %%zmm4 \n\t" \ + "vmovups %%zmm1, %%zmm5 \n\t" + +#define loadOffset_6_2 \ + loadOffset_3_2 \ + "vmovups %%zmm0, %%zmm6 \n\t" \ + "vmovups %%zmm1, %%zmm7 \n\t" \ + "vmovups %%zmm0, %%zmm8 \n\t" \ + "vmovups %%zmm1, %%zmm9 \n\t" \ + "vmovups %%zmm0, %%zmm10 \n\t" \ + "vmovups %%zmm1, %%zmm11 \n\t" + +#define loadOffset_12_2 \ + loadOffset_6_2 \ + "vmovups %%zmm0, %%zmm12 \n\t" \ + "vmovups %%zmm1, %%zmm13 \n\t" \ + "vmovups %%zmm0, %%zmm14 \n\t" \ + "vmovups %%zmm1, %%zmm15 \n\t" \ + "vmovups %%zmm0, %%zmm16 \n\t" \ + "vmovups %%zmm1, %%zmm17 \n\t" \ + "vmovups %%zmm0, %%zmm18 \n\t" \ + "vmovups %%zmm1, %%zmm19 \n\t" \ + "vmovups %%zmm0, %%zmm20 \n\t" \ + "vmovups %%zmm1, %%zmm21 \n\t" \ + "vmovups %%zmm0, %%zmm22 \n\t" \ + "vmovups %%zmm1, %%zmm23 \n\t" + +#define loadOffset_1_3 \ + loadOffset_1_2 \ + "vmovups 0x80(%[offset]), 
%%zmm2 \n\t" + +#define loadOffset_2_3 \ + loadOffset_1_3 \ + "vmovups %%zmm0, %%zmm3 \n\t" \ + "vmovups %%zmm1, %%zmm4 \n\t" \ + "vmovups %%zmm2, %%zmm5 \n\t" + +#define loadOffset_4_3 \ + loadOffset_2_3 \ + "vmovups %%zmm0, %%zmm6 \n\t" \ + "vmovups %%zmm1, %%zmm7 \n\t" \ + "vmovups %%zmm2, %%zmm8 \n\t" \ + "vmovups %%zmm0, %%zmm9 \n\t" \ + "vmovups %%zmm1, %%zmm10 \n\t" \ + "vmovups %%zmm2, %%zmm11 \n\t" + +#define loadOffset_8_3 \ + loadOffset_4_3 \ + "vmovups %%zmm0, %%zmm12 \n\t" \ + "vmovups %%zmm1, %%zmm13 \n\t" \ + "vmovups %%zmm2, %%zmm14 \n\t" \ + "vmovups %%zmm0, %%zmm15 \n\t" \ + "vmovups %%zmm1, %%zmm16 \n\t" \ + "vmovups %%zmm2, %%zmm17 \n\t" \ + "vmovups %%zmm0, %%zmm18 \n\t" \ + "vmovups %%zmm1, %%zmm19 \n\t" \ + "vmovups %%zmm2, %%zmm20 \n\t" \ + "vmovups %%zmm0, %%zmm21 \n\t" \ + "vmovups %%zmm1, %%zmm22 \n\t" \ + "vmovups %%zmm2, %%zmm23 \n\t" + +#define addC_1_1(rtype, C) \ + "movq "#C", %%rax \n\t" \ + "vpaddd (%%rax), "#rtype"0, "#rtype"0 \n\t" + +#define addC_6_1(rtype, C) \ + addC_1_1(rtype, C) \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), "#rtype"1, "#rtype"1 \n\t" \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), "#rtype"2, "#rtype"2 \n\t" \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), "#rtype"3, "#rtype"3 \n\t" \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), "#rtype"4, "#rtype"4 \n\t" \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), "#rtype"5, "#rtype"5 \n\t" + +#define addC_12_1(rtype, C) \ + addC_6_1(rtype, C) \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), "#rtype"6, "#rtype"6 \n\t" \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), "#rtype"7, "#rtype"7 \n\t" \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), "#rtype"8, "#rtype"8 \n\t" \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), "#rtype"9, "#rtype"9 \n\t" \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), "#rtype"10, "#rtype"10 \n\t" \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), "#rtype"11, "#rtype"11 \n\t" + +#define addC_24_1(rtype, C) \ + addC_12_1(rtype, C) \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), "#rtype"12, "#rtype"12 \n\t" \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), "#rtype"13, "#rtype"13 \n\t" \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), "#rtype"14, "#rtype"14 \n\t" \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), "#rtype"15, "#rtype"15 \n\t" \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), "#rtype"16, "#rtype"16 \n\t" \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), "#rtype"17, "#rtype"17 \n\t" \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), "#rtype"18, "#rtype"18 \n\t" \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), "#rtype"19, "#rtype"19 \n\t" \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), "#rtype"20, "#rtype"20 \n\t" \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), "#rtype"21, "#rtype"21 \n\t" \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), "#rtype"22, "#rtype"22 \n\t" \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), "#rtype"23, "#rtype"23 \n\t" + +#define addC_1_2(C) \ + addC_1_1(%%zmm, C) \ + "vpaddd 0x40(%%rax), %%zmm1, %%zmm1 \n\t" + +#define addC_3_2(C) \ + addC_1_2(C) \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), %%zmm2, %%zmm2 \n\t" \ + "vpaddd 0x40(%%rax), %%zmm3, %%zmm3 \n\t" \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), %%zmm4, %%zmm4 \n\t" \ + "vpaddd 0x40(%%rax), %%zmm5, %%zmm5 \n\t" + +#define addC_6_2(C) \ + addC_3_2(C) \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), %%zmm6, %%zmm6 \n\t" \ + "vpaddd 0x40(%%rax), %%zmm7, %%zmm7 \n\t" \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), %%zmm8, %%zmm8 \n\t" \ + "vpaddd 0x40(%%rax), %%zmm9, %%zmm9 \n\t" 
\ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), %%zmm10, %%zmm10 \n\t" \ + "vpaddd 0x40(%%rax), %%zmm11, %%zmm11 \n\t" + +#define addC_12_2(C) \ + addC_6_2(C) \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), %%zmm12, %%zmm12 \n\t" \ + "vpaddd 0x40(%%rax), %%zmm13, %%zmm13 \n\t" \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), %%zmm14, %%zmm14 \n\t" \ + "vpaddd 0x40(%%rax), %%zmm15, %%zmm15 \n\t" \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), %%zmm16, %%zmm16 \n\t" \ + "vpaddd 0x40(%%rax), %%zmm17, %%zmm17 \n\t" \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), %%zmm18, %%zmm18 \n\t" \ + "vpaddd 0x40(%%rax), %%zmm19, %%zmm19 \n\t" \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), %%zmm20, %%zmm20 \n\t" \ + "vpaddd 0x40(%%rax), %%zmm21, %%zmm21 \n\t" \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), %%zmm22, %%zmm22 \n\t" \ + "vpaddd 0x40(%%rax), %%zmm23, %%zmm23 \n\t" + +#define addC_1_3(C) \ + "vpaddd ("#C"), %%zmm0, %%zmm0 \n\t" \ + "vpaddd 0x40("#C"), %%zmm1, %%zmm1 \n\t" \ + "vpaddd 0x80("#C"), %%zmm2, %%zmm2 \n\t" + +#define addC_2_3(C) \ + "vpaddd ("#C"), %%zmm0, %%zmm0 \n\t" \ + "vpaddd 0x40("#C"), %%zmm1, %%zmm1 \n\t" \ + "vpaddd 0x80("#C"), %%zmm2, %%zmm2 \n\t" \ + "vpaddd ("#C", %[N]), %%zmm3, %%zmm3 \n\t" \ + "vpaddd 0x40("#C", %[N]), %%zmm4, %%zmm4 \n\t" \ + "vpaddd 0x80("#C", %[N]), %%zmm5, %%zmm5 \n\t" + +#define addC_4_3(C) \ + addC_2_3(C) \ + "addq %%rcx, "#C" \n\t" \ + "vpaddd ("#C"), %%zmm6, %%zmm6 \n\t" \ + "vpaddd 0x40("#C"), %%zmm7, %%zmm7 \n\t" \ + "vpaddd 0x80("#C"), %%zmm8, %%zmm8 \n\t" \ + "vpaddd ("#C", %[N]), %%zmm9, %%zmm9 \n\t" \ + "vpaddd 0x40("#C", %[N]), %%zmm10, %%zmm10 \n\t" \ + "vpaddd 0x80("#C", %[N]), %%zmm11, %%zmm11 \n\t" + +#define addC_8_3(C) \ + addC_4_3(C) \ + "addq %%rcx, "#C" \n\t" \ + "vpaddd ("#C"), %%zmm12, %%zmm12 \n\t" \ + "vpaddd 0x40("#C"), %%zmm13, %%zmm13 \n\t" \ + "vpaddd 0x80("#C"), %%zmm14, %%zmm14 \n\t" \ + "vpaddd ("#C", %[N]), %%zmm15, %%zmm15 \n\t" \ + "vpaddd 0x40("#C", %[N]), %%zmm16, %%zmm16 \n\t" \ + "vpaddd 0x80("#C", %[N]), %%zmm17, %%zmm17 \n\t" \ + "addq %%rcx, "#C" \n\t" \ + "vpaddd ("#C"), %%zmm18, %%zmm18 \n\t" \ + "vpaddd 0x40("#C"), %%zmm19, %%zmm19 \n\t" \ + "vpaddd 0x80("#C"), %%zmm20, %%zmm20 \n\t" \ + "vpaddd ("#C", %[N]), %%zmm21, %%zmm21 \n\t" \ + "vpaddd 0x40("#C", %[N]), %%zmm22, %%zmm22 \n\t" \ + "vpaddd 0x80("#C", %[N]), %%zmm23, %%zmm23 \n\t" \ + +#define storeC_1_1_0(op, rtype, C, off0, off1) \ + "movq "#C", %%rax \n\t" \ + #op" "#rtype"0, (%%rax) \n\t" + +#define storeC_2_1_0(op, rtype, C, off0, off1) \ + storeC_1_1_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"1, (%%rax) \n\t" + +#define storeC_3_1_0(op, rtype, C, off0, off1) \ + storeC_2_1_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"2, (%%rax) \n\t" + +#define storeC_4_1_0(op, rtype, C, off0, off1) \ + storeC_3_1_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"3, (%%rax) \n\t" + +#define storeC_5_1_0(op, rtype, C, off0, off1) \ + storeC_4_1_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"4, (%%rax) \n\t" + +#define storeC_6_1_0(op, rtype, C, off0, off1) \ + storeC_5_1_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"5, (%%rax) \n\t" + +#define storeC_7_1_0(op, rtype, C, off0, off1) \ + storeC_6_1_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"6, (%%rax) \n\t" + +#define storeC_8_1_0(op, rtype, C, off0, off1) \ + storeC_7_1_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"7, 
(%%rax) \n\t" + +#define storeC_9_1_0(op, rtype, C, off0, off1) \ + storeC_8_1_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"8, (%%rax) \n\t" + +#define storeC_10_1_0(op, rtype, C, off0, off1) \ + storeC_9_1_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"9, (%%rax) \n\t" + +#define storeC_11_1_0(op, rtype, C, off0, off1) \ + storeC_10_1_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"10, (%%rax) \n\t" + +#define storeC_12_1_0(op, rtype, C, off0, off1) \ + storeC_11_1_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"11, (%%rax) \n\t" + +#define storeC_13_1_0(op, rtype, C, off0, off1) \ + storeC_12_1_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"12, (%%rax) \n\t" + +#define storeC_14_1_0(op, rtype, C, off0, off1) \ + storeC_13_1_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"13, (%%rax) \n\t" + +#define storeC_15_1_0(op, rtype, C, off0, off1) \ + storeC_14_1_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"14, (%%rax) \n\t" + +#define storeC_16_1_0(op, rtype, C, off0, off1) \ + storeC_15_1_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"15, (%%rax) \n\t" + +#define storeC_17_1_0(op, rtype, C, off0, off1) \ + storeC_16_1_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"16, (%%rax) \n\t" + +#define storeC_18_1_0(op, rtype, C, off0, off1) \ + storeC_17_1_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"17, (%%rax) \n\t" + +#define storeC_19_1_0(op, rtype, C, off0, off1) \ + storeC_18_1_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"18, (%%rax) \n\t" + +#define storeC_20_1_0(op, rtype, C, off0, off1) \ + storeC_19_1_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"19, (%%rax) \n\t" + +#define storeC_21_1_0(op, rtype, C, off0, off1) \ + storeC_20_1_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"20, (%%rax) \n\t" + +#define storeC_22_1_0(op, rtype, C, off0, off1) \ + storeC_21_1_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"21, (%%rax) \n\t" + +#define storeC_23_1_0(op, rtype, C, off0, off1) \ + storeC_22_1_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"22, (%%rax) \n\t" + +#define storeC_24_1_0(op, rtype, C, off0, off1) \ + storeC_23_1_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"23, (%%rax) \n\t" + +#define storeC_1_2_0(op, rtype, C, off0, off1) \ + storeC_1_1_0(op, rtype, C, off0, off1) \ + #op" "#rtype"1, "#off0"(%%rax) \n\t" + +#define storeC_2_2_0(op, rtype, C, off0, off1) \ + storeC_1_2_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"2, (%%rax) \n\t" \ + #op" "#rtype"3, "#off0"(%%rax) \n\t" + +#define storeC_3_2_0(op, rtype, C, off0, off1) \ + storeC_2_2_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"4, (%%rax) \n\t" \ + #op" "#rtype"5, "#off0"(%%rax) \n\t" + +#define storeC_4_2_0(op, rtype, C, off0, off1) \ + storeC_3_2_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"6, (%%rax) \n\t" \ + #op" "#rtype"7, "#off0"(%%rax) \n\t" + +#define storeC_5_2_0(op, rtype, C, off0, off1) \ + storeC_4_2_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"8, (%%rax) \n\t" \ + #op" "#rtype"9, "#off0"(%%rax) \n\t" + +#define storeC_6_2_0(op, rtype, C, off0, off1) \ + storeC_5_2_0(op, rtype, C, off0, off1) \ + 
"addq %[N], %%rax \n\t" \ + #op" "#rtype"10, (%%rax) \n\t" \ + #op" "#rtype"11, "#off0"(%%rax) \n\t" + +#define storeC_7_2_0(op, rtype, C, off0, off1) \ + storeC_6_2_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"12, (%%rax) \n\t" \ + #op" "#rtype"13, "#off0"(%%rax) \n\t" + +#define storeC_8_2_0(op, rtype, C, off0, off1) \ + storeC_7_2_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"14, (%%rax) \n\t" \ + #op" "#rtype"15, "#off0"(%%rax) \n\t" + +#define storeC_9_2_0(op, rtype, C, off0, off1) \ + storeC_8_2_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"16, (%%rax) \n\t" \ + #op" "#rtype"17, "#off0"(%%rax) \n\t" + +#define storeC_10_2_0(op, rtype, C, off0, off1) \ + storeC_9_2_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"18, (%%rax) \n\t" \ + #op" "#rtype"19, "#off0"(%%rax) \n\t" + +#define storeC_11_2_0(op, rtype, C, off0, off1) \ + storeC_10_2_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"20, (%%rax) \n\t" \ + #op" "#rtype"21, "#off0"(%%rax) \n\t" + +#define storeC_12_2_0(op, rtype, C, off0, off1) \ + storeC_11_2_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"22, (%%rax) \n\t" \ + #op" "#rtype"23, "#off0"(%%rax) \n\t" + +#define storeC_1_3_0(op, rtype, C, off0, off1) \ + "movq "#C", %%rax \n\t" \ + #op" "#rtype"0, (%%rax) \n\t" \ + #op" "#rtype"1, "#off0"(%%rax) \n\t" \ + #op" "#rtype"2, "#off1"(%%rax) \n\t" + +#define storeC_2_3_0(op, rtype, C, off0, off1) \ + storeC_1_3_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"3, (%%rax) \n\t" \ + #op" "#rtype"4, "#off0"(%%rax) \n\t" \ + #op" "#rtype"5, "#off1"(%%rax) \n\t" + +#define storeC_3_3_0(op, rtype, C, off0, off1) \ + storeC_2_3_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"6, (%%rax) \n\t" \ + #op" "#rtype"7, "#off0"(%%rax) \n\t" \ + #op" "#rtype"8, "#off1"(%%rax) \n\t" + +#define storeC_4_3_0(op, rtype, C, off0, off1) \ + storeC_3_3_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"9, (%%rax) \n\t" \ + #op" "#rtype"10, "#off0"(%%rax) \n\t" \ + #op" "#rtype"11, "#off1"(%%rax) \n\t" + +#define storeC_5_3_0(op, rtype, C, off0, off1) \ + storeC_4_3_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"12, (%%rax) \n\t" \ + #op" "#rtype"13, "#off0"(%%rax) \n\t" \ + #op" "#rtype"14, "#off1"(%%rax) \n\t" + +#define storeC_6_3_0(op, rtype, C, off0, off1) \ + storeC_5_3_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"15, (%%rax) \n\t" \ + #op" "#rtype"16, "#off0"(%%rax) \n\t" \ + #op" "#rtype"17, "#off1"(%%rax) \n\t" + +#define storeC_7_3_0(op, rtype, C, off0, off1) \ + storeC_6_3_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"18, (%%rax) \n\t" \ + #op" "#rtype"19, "#off0"(%%rax) \n\t" \ + #op" "#rtype"20, "#off1"(%%rax) \n\t" + +#define storeC_8_3_0(op, rtype, C, off0, off1) \ + storeC_7_3_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"21, (%%rax) \n\t" \ + #op" "#rtype"22, "#off0"(%%rax) \n\t" \ + #op" "#rtype"23, "#off1"(%%rax) \n\t" + +#define storeC_1_1_1(op, rtype, C, off0, off1) \ + "movq "#C", %%rax \n\t" \ + "kmovw %[nmask], %%k1 \n\t" \ + #op" "#rtype"0, (%%rax) %{%%k1%} \n\t" + +#define storeC_2_1_1(op, rtype, C, off0, off1) \ + storeC_1_1_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"1, (%%rax) %{%%k1%} \n\t" + +#define storeC_3_1_1(op, rtype, C, off0, off1) \ + storeC_2_1_1(op, 
rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"2, (%%rax) %{%%k1%} \n\t" + +#define storeC_4_1_1(op, rtype, C, off0, off1) \ + storeC_3_1_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"3, (%%rax) %{%%k1%} \n\t" + +#define storeC_5_1_1(op, rtype, C, off0, off1) \ + storeC_4_1_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"4, (%%rax) %{%%k1%} \n\t" + +#define storeC_6_1_1(op, rtype, C, off0, off1) \ + storeC_5_1_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"5, (%%rax) %{%%k1%} \n\t" + +#define storeC_7_1_1(op, rtype, C, off0, off1) \ + storeC_6_1_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"6, (%%rax) %{%%k1%} \n\t" + +#define storeC_8_1_1(op, rtype, C, off0, off1) \ + storeC_7_1_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"7, (%%rax) %{%%k1%} \n\t" + +#define storeC_9_1_1(op, rtype, C, off0, off1) \ + storeC_8_1_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"8, (%%rax) %{%%k1%} \n\t" + +#define storeC_10_1_1(op, rtype, C, off0, off1) \ + storeC_9_1_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"9, (%%rax) %{%%k1%} \n\t" + +#define storeC_11_1_1(op, rtype, C, off0, off1) \ + storeC_10_1_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"10, (%%rax) %{%%k1%} \n\t" + +#define storeC_12_1_1(op, rtype, C, off0, off1) \ + storeC_11_1_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"11, (%%rax) %{%%k1%} \n\t" + +#define storeC_13_1_1(op, rtype, C, off0, off1) \ + storeC_12_1_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"12, (%%rax) %{%%k1%} \n\t" + +#define storeC_14_1_1(op, rtype, C, off0, off1) \ + storeC_13_1_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"13, (%%rax) %{%%k1%} \n\t" + +#define storeC_15_1_1(op, rtype, C, off0, off1) \ + storeC_14_1_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"14, (%%rax) %{%%k1%} \n\t" + +#define storeC_16_1_1(op, rtype, C, off0, off1) \ + storeC_15_1_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"15, (%%rax) %{%%k1%} \n\t" + +#define storeC_17_1_1(op, rtype, C, off0, off1) \ + storeC_16_1_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"16, (%%rax) %{%%k1%} \n\t" + +#define storeC_18_1_1(op, rtype, C, off0, off1) \ + storeC_17_1_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"17, (%%rax) %{%%k1%} \n\t" + +#define storeC_19_1_1(op, rtype, C, off0, off1) \ + storeC_18_1_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"18, (%%rax) %{%%k1%} \n\t" + +#define storeC_20_1_1(op, rtype, C, off0, off1) \ + storeC_19_1_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"19, (%%rax) %{%%k1%} \n\t" + +#define storeC_21_1_1(op, rtype, C, off0, off1) \ + storeC_20_1_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"20, (%%rax) %{%%k1%} \n\t" + +#define storeC_22_1_1(op, rtype, C, off0, off1) \ + storeC_21_1_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"21, (%%rax) %{%%k1%} \n\t" + +#define storeC_23_1_1(op, rtype, C, off0, off1) \ + storeC_22_1_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"22, (%%rax) %{%%k1%} \n\t" + +#define storeC_24_1_1(op, rtype, C, off0, off1) \ + storeC_23_1_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" 
"#rtype"23, (%%rax) %{%%k1%} \n\t" + +#define storeC_1_2_1(op, rtype, C, off0, off1) \ + "kmovw %[nmask], %%k1 \n\t" \ + storeC_1_1_0(op, rtype, C, off0, off1) \ + #op" "#rtype"1, "#off0"(%%rax) %{%%k1%} \n\t" + +#define storeC_2_2_1(op, rtype, C, off0, off1) \ + storeC_1_2_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"2, (%%rax) \n\t" \ + #op" "#rtype"3, "#off0"(%%rax) %{%%k1%} \n\t" + +#define storeC_3_2_1(op, rtype, C, off0, off1) \ + storeC_2_2_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"4, (%%rax) \n\t" \ + #op" "#rtype"5, "#off0"(%%rax) %{%%k1%} \n\t" + +#define storeC_4_2_1(op, rtype, C, off0, off1) \ + storeC_3_2_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"6, (%%rax) \n\t" \ + #op" "#rtype"7, "#off0"(%%rax) %{%%k1%} \n\t" + +#define storeC_5_2_1(op, rtype, C, off0, off1) \ + storeC_4_2_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"8, (%%rax) \n\t" \ + #op" "#rtype"9, "#off0"(%%rax) %{%%k1%} \n\t" + +#define storeC_6_2_1(op, rtype, C, off0, off1) \ + storeC_5_2_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"10, (%%rax) \n\t" \ + #op" "#rtype"11, "#off0"(%%rax) %{%%k1%} \n\t" + +#define storeC_7_2_1(op, rtype, C, off0, off1) \ + storeC_6_2_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"12, (%%rax) \n\t" \ + #op" "#rtype"13, "#off0"(%%rax) %{%%k1%} \n\t" + +#define storeC_8_2_1(op, rtype, C, off0, off1) \ + storeC_7_2_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"14, (%%rax) \n\t" \ + #op" "#rtype"15, "#off0"(%%rax) %{%%k1%} \n\t" + +#define storeC_9_2_1(op, rtype, C, off0, off1) \ + storeC_8_2_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"16, (%%rax) \n\t" \ + #op" "#rtype"17, "#off0"(%%rax) %{%%k1%} \n\t" + +#define storeC_10_2_1(op, rtype, C, off0, off1) \ + storeC_9_2_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"18, (%%rax) \n\t" \ + #op" "#rtype"19, "#off0"(%%rax) %{%%k1%} \n\t" + +#define storeC_11_2_1(op, rtype, C, off0, off1) \ + storeC_10_2_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"20, (%%rax) \n\t" \ + #op" "#rtype"21, "#off0"(%%rax) %{%%k1%} \n\t" + +#define storeC_12_2_1(op, rtype, C, off0, off1) \ + storeC_11_2_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"22, (%%rax) \n\t" \ + #op" "#rtype"23, "#off0"(%%rax) %{%%k1%} \n\t" + +#define storeC_1_3_1(op, rtype, C, off0, off1) \ + "kmovw %[nmask], %%k1 \n\t" \ + storeC_1_2_0(op, rtype, C, off0, off1) \ + #op" "#rtype"2, "#off1"(%%rax) %{%%k1%} \n\t" + +#define storeC_2_3_1(op, rtype, C, off0, off1) \ + storeC_1_3_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"3, (%%rax) \n\t" \ + #op" "#rtype"4, "#off0"(%%rax) \n\t" \ + #op" "#rtype"5, "#off1"(%%rax) %{%%k1%} \n\t" + +#define storeC_3_3_1(op, rtype, C, off0, off1) \ + storeC_2_3_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"6, (%%rax) \n\t" \ + #op" "#rtype"7, "#off0"(%%rax) \n\t" \ + #op" "#rtype"8, "#off1"(%%rax) %{%%k1%} \n\t" + +#define storeC_4_3_1(op, rtype, C, off0, off1) \ + storeC_3_3_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"9, (%%rax) \n\t" \ + #op" "#rtype"10, "#off0"(%%rax) \n\t" \ + #op" "#rtype"11, "#off1"(%%rax) %{%%k1%} \n\t" + +#define storeC_5_3_1(op, rtype, C, off0, off1) \ + storeC_4_3_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"12, (%%rax) \n\t" 
\ + #op" "#rtype"13, "#off0"(%%rax) \n\t" \ + #op" "#rtype"14, "#off1"(%%rax) %{%%k1%} \n\t" + +#define storeC_6_3_1(op, rtype, C, off0, off1) \ + storeC_5_3_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"15, (%%rax) \n\t" \ + #op" "#rtype"16, "#off0"(%%rax) \n\t" \ + #op" "#rtype"17, "#off1"(%%rax) %{%%k1%} \n\t" + +#define storeC_7_3_1(op, rtype, C, off0, off1) \ + storeC_6_3_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"18, (%%rax) \n\t" \ + #op" "#rtype"19, "#off0"(%%rax) \n\t" \ + #op" "#rtype"20, "#off1"(%%rax) %{%%k1%} \n\t" + +#define storeC_8_3_1(op, rtype, C, off0, off1) \ + storeC_7_3_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"21, (%%rax) \n\t" \ + #op" "#rtype"22, "#off0"(%%rax) \n\t" \ + #op" "#rtype"23, "#off1"(%%rax) %{%%k1%} \n\t" + +#define clear1Regs(rtype) \ + "vxorps "#rtype"0, "#rtype"0, "#rtype"0 \n\t" + +#define clear2Regs(rtype) \ + clear1Regs(rtype) \ + "vxorps "#rtype"1, "#rtype"1, "#rtype"1 \n\t" + +#define clear3Regs(rtype) \ + clear2Regs(rtype) \ + "vxorps "#rtype"2, "#rtype"2, "#rtype"2 \n\t" + +#define clear4Regs(rtype) \ + clear3Regs(rtype) \ + "vxorps "#rtype"3, "#rtype"3, "#rtype"3 \n\t" + +#define clear6Regs(rtype) \ + clear4Regs(rtype) \ + "vxorps "#rtype"4, "#rtype"4, "#rtype"4 \n\t" \ + "vxorps "#rtype"5, "#rtype"5, "#rtype"5 \n\t" + +#define clear8Regs(rtype) \ + clear6Regs(rtype) \ + "vxorps "#rtype"6, "#rtype"6, "#rtype"6 \n\t" \ + "vxorps "#rtype"7, "#rtype"7, "#rtype"7 \n\t" + +#define clear9Regs(rtype) \ + clear8Regs(rtype) \ + "vxorps "#rtype"8, "#rtype"8, "#rtype"8 \n\t" + +#define clear12Regs(rtype) \ + clear9Regs(rtype) \ + "vxorps "#rtype"9, "#rtype"9, "#rtype"9 \n\t" \ + "vxorps "#rtype"10, "#rtype"10, "#rtype"10 \n\t" \ + "vxorps "#rtype"11, "#rtype"11, "#rtype"11 \n\t" + +#define clear24Regs(rtype) \ + clear12Regs(rtype) \ + "vxorps "#rtype"12, "#rtype"12, "#rtype"12 \n\t" \ + "vxorps "#rtype"13, "#rtype"13, "#rtype"13 \n\t" \ + "vxorps "#rtype"14, "#rtype"14, "#rtype"14 \n\t" \ + "vxorps "#rtype"15, "#rtype"15, "#rtype"15 \n\t" \ + "vxorps "#rtype"16, "#rtype"16, "#rtype"16 \n\t" \ + "vxorps "#rtype"17, "#rtype"17, "#rtype"17 \n\t" \ + "vxorps "#rtype"18, "#rtype"18, "#rtype"18 \n\t" \ + "vxorps "#rtype"19, "#rtype"19, "#rtype"19 \n\t" \ + "vxorps "#rtype"20, "#rtype"20, "#rtype"20 \n\t" \ + "vxorps "#rtype"21, "#rtype"21, "#rtype"21 \n\t" \ + "vxorps "#rtype"22, "#rtype"22, "#rtype"22 \n\t" \ + "vxorps "#rtype"23, "#rtype"23, "#rtype"23 \n\t" + +#define convert1I32Regs2Ps(rtype, sReg) \ + "vbroadcastss ("#sReg"), "#rtype"24 \n\t" \ + "vcvtdq2ps "#rtype"0, "#rtype"0 \n\t" \ + "vmulps "#rtype"0, "#rtype"24, "#rtype"0 \n\t" + +#define convert2I32Regs2Ps(rtype, sReg) \ + "vbroadcastss ("#sReg"), "#rtype"24 \n\t" \ + "vcvtdq2ps "#rtype"0, "#rtype"0 \n\t" \ + "vcvtdq2ps "#rtype"1, "#rtype"1 \n\t" \ + "vmulps "#rtype"0, "#rtype"24, "#rtype"0 \n\t" \ + "vmulps "#rtype"1, "#rtype"24, "#rtype"1 \n\t" + +#define convert3I32Regs2Ps(rtype, sReg) \ + "vbroadcastss ("#sReg"), "#rtype"24 \n\t" \ + "vcvtdq2ps "#rtype"0, "#rtype"0 \n\t" \ + "vcvtdq2ps "#rtype"1, "#rtype"1 \n\t" \ + "vcvtdq2ps "#rtype"2, "#rtype"2 \n\t" \ + "vmulps "#rtype"0, "#rtype"24, "#rtype"0 \n\t" \ + "vmulps "#rtype"1, "#rtype"24, "#rtype"1 \n\t" \ + "vmulps "#rtype"2, "#rtype"24, "#rtype"2 \n\t" + +#define convert4I32Regs2Ps(rtype, sReg) \ + "vbroadcastss ("#sReg"), "#rtype"24 \n\t" \ + "vcvtdq2ps "#rtype"0, "#rtype"0 \n\t" \ + "vcvtdq2ps "#rtype"1, "#rtype"1 \n\t" \ + "vcvtdq2ps "#rtype"2, 
"#rtype"2 \n\t" \ + "vcvtdq2ps "#rtype"3, "#rtype"3 \n\t" \ + "vmulps "#rtype"0, "#rtype"24, "#rtype"0 \n\t" \ + "vmulps "#rtype"1, "#rtype"24, "#rtype"1 \n\t" \ + "vmulps "#rtype"2, "#rtype"24, "#rtype"2 \n\t" \ + "vmulps "#rtype"3, "#rtype"24, "#rtype"3 \n\t" + +#define convert6I32Regs2Ps(rtype, sReg) \ + "vbroadcastss ("#sReg"), "#rtype"24 \n\t" \ + "vcvtdq2ps "#rtype"0, "#rtype"0 \n\t" \ + "vcvtdq2ps "#rtype"1, "#rtype"1 \n\t" \ + "vcvtdq2ps "#rtype"2, "#rtype"2 \n\t" \ + "vcvtdq2ps "#rtype"3, "#rtype"3 \n\t" \ + "vcvtdq2ps "#rtype"4, "#rtype"4 \n\t" \ + "vcvtdq2ps "#rtype"5, "#rtype"5 \n\t" \ + "vmulps "#rtype"0, "#rtype"24, "#rtype"0 \n\t" \ + "vmulps "#rtype"1, "#rtype"24, "#rtype"1 \n\t" \ + "vmulps "#rtype"2, "#rtype"24, "#rtype"2 \n\t" \ + "vmulps "#rtype"3, "#rtype"24, "#rtype"3 \n\t" \ + "vmulps "#rtype"4, "#rtype"24, "#rtype"4 \n\t" \ + "vmulps "#rtype"5, "#rtype"24, "#rtype"5 \n\t" + +#define convert12I32Regs2Ps(rtype, sReg) \ + convert6I32Regs2Ps(rtype, sReg) \ + "vcvtdq2ps "#rtype"6, "#rtype"6 \n\t" \ + "vcvtdq2ps "#rtype"7, "#rtype"7 \n\t" \ + "vcvtdq2ps "#rtype"8, "#rtype"8 \n\t" \ + "vcvtdq2ps "#rtype"9, "#rtype"9 \n\t" \ + "vcvtdq2ps "#rtype"10, "#rtype"10 \n\t" \ + "vcvtdq2ps "#rtype"11, "#rtype"11 \n\t" \ + "vmulps "#rtype"6, "#rtype"24, "#rtype"6 \n\t" \ + "vmulps "#rtype"7, "#rtype"24, "#rtype"7 \n\t" \ + "vmulps "#rtype"8, "#rtype"24, "#rtype"8 \n\t" \ + "vmulps "#rtype"9, "#rtype"24, "#rtype"9 \n\t" \ + "vmulps "#rtype"10, "#rtype"24, "#rtype"10 \n\t" \ + "vmulps "#rtype"11, "#rtype"24, "#rtype"11 \n\t" + +#define convert24I32Regs2Ps(rtype, sReg) \ + convert12I32Regs2Ps(rtype, sReg) \ + "vcvtdq2ps "#rtype"12, "#rtype"12 \n\t" \ + "vcvtdq2ps "#rtype"13, "#rtype"13 \n\t" \ + "vcvtdq2ps "#rtype"14, "#rtype"14 \n\t" \ + "vcvtdq2ps "#rtype"15, "#rtype"15 \n\t" \ + "vcvtdq2ps "#rtype"16, "#rtype"16 \n\t" \ + "vcvtdq2ps "#rtype"17, "#rtype"17 \n\t" \ + "vcvtdq2ps "#rtype"18, "#rtype"18 \n\t" \ + "vcvtdq2ps "#rtype"19, "#rtype"19 \n\t" \ + "vcvtdq2ps "#rtype"20, "#rtype"20 \n\t" \ + "vcvtdq2ps "#rtype"21, "#rtype"21 \n\t" \ + "vcvtdq2ps "#rtype"22, "#rtype"22 \n\t" \ + "vcvtdq2ps "#rtype"23, "#rtype"23 \n\t" \ + "vmulps "#rtype"12, "#rtype"24, "#rtype"12 \n\t" \ + "vmulps "#rtype"13, "#rtype"24, "#rtype"13 \n\t" \ + "vmulps "#rtype"14, "#rtype"24, "#rtype"14 \n\t" \ + "vmulps "#rtype"15, "#rtype"24, "#rtype"15 \n\t" \ + "vmulps "#rtype"16, "#rtype"24, "#rtype"16 \n\t" \ + "vmulps "#rtype"17, "#rtype"24, "#rtype"17 \n\t" \ + "vmulps "#rtype"18, "#rtype"24, "#rtype"18 \n\t" \ + "vmulps "#rtype"19, "#rtype"24, "#rtype"19 \n\t" \ + "vmulps "#rtype"20, "#rtype"24, "#rtype"20 \n\t" \ + "vmulps "#rtype"21, "#rtype"24, "#rtype"21 \n\t" \ + "vmulps "#rtype"22, "#rtype"24, "#rtype"22 \n\t" \ + "vmulps "#rtype"23, "#rtype"24, "#rtype"23 \n\t" + +#define convert1PsRegs2U8(rtype) \ + "mov $128, %%eax \n\t" \ + "vmovd %%eax, %%xmm25 \n\t" \ + "vbroadcastss %%xmm25, "#rtype"24 \n\t" \ + "vcvtps2dq "#rtype"0, "#rtype"0 \n\t" \ + "vpaddd "#rtype"0, "#rtype"24, "#rtype"0 \n\t" + +#define convert2PsRegs2U8(rtype) \ + "mov $128, %%eax \n\t" \ + "vmovd %%eax, %%xmm25 \n\t" \ + "vbroadcastss %%xmm25, "#rtype"24 \n\t" \ + "vcvtps2dq "#rtype"0, "#rtype"0 \n\t" \ + "vcvtps2dq "#rtype"1, "#rtype"1 \n\t" \ + "vpaddd "#rtype"0, "#rtype"24, "#rtype"0 \n\t" \ + "vpaddd "#rtype"1, "#rtype"24, "#rtype"1 \n\t" + +#define convert3PsRegs2U8(rtype) \ + "mov $128, %%eax \n\t" \ + "vmovd %%eax, %%xmm25 \n\t" \ + "vbroadcastss %%xmm25, "#rtype"24 \n\t" \ + "vcvtps2dq "#rtype"0, "#rtype"0 \n\t" \ + "vcvtps2dq 
"#rtype"1, "#rtype"1 \n\t" \ + "vcvtps2dq "#rtype"2, "#rtype"2 \n\t" \ + "vpaddd "#rtype"0, "#rtype"24, "#rtype"0 \n\t" \ + "vpaddd "#rtype"1, "#rtype"24, "#rtype"1 \n\t" \ + "vpaddd "#rtype"2, "#rtype"24, "#rtype"2 \n\t" + +#define convert4PsRegs2U8(rtype) \ + "mov $128, %%eax \n\t" \ + "vmovd %%eax, %%xmm25 \n\t" \ + "vbroadcastss %%xmm25, "#rtype"24 \n\t" \ + "vcvtps2dq "#rtype"0, "#rtype"0 \n\t" \ + "vcvtps2dq "#rtype"1, "#rtype"1 \n\t" \ + "vcvtps2dq "#rtype"2, "#rtype"2 \n\t" \ + "vcvtps2dq "#rtype"3, "#rtype"3 \n\t" \ + "vpaddd "#rtype"0, "#rtype"24, "#rtype"0 \n\t" \ + "vpaddd "#rtype"1, "#rtype"24, "#rtype"1 \n\t" \ + "vpaddd "#rtype"2, "#rtype"24, "#rtype"2 \n\t" \ + "vpaddd "#rtype"3, "#rtype"24, "#rtype"3 \n\t" + +#define convert6PsRegs2U8(rtype) \ + "mov $128, %%eax \n\t" \ + "vmovd %%eax, %%xmm25 \n\t" \ + "vbroadcastss %%xmm25, "#rtype"24 \n\t" \ + "vcvtps2dq "#rtype"0, "#rtype"0 \n\t" \ + "vcvtps2dq "#rtype"1, "#rtype"1 \n\t" \ + "vcvtps2dq "#rtype"2, "#rtype"2 \n\t" \ + "vcvtps2dq "#rtype"3, "#rtype"3 \n\t" \ + "vcvtps2dq "#rtype"4, "#rtype"4 \n\t" \ + "vcvtps2dq "#rtype"5, "#rtype"5 \n\t" \ + "vpaddd "#rtype"0, "#rtype"24, "#rtype"0 \n\t" \ + "vpaddd "#rtype"1, "#rtype"24, "#rtype"1 \n\t" \ + "vpaddd "#rtype"2, "#rtype"24, "#rtype"2 \n\t" \ + "vpaddd "#rtype"3, "#rtype"24, "#rtype"3 \n\t" \ + "vpaddd "#rtype"4, "#rtype"24, "#rtype"4 \n\t" \ + "vpaddd "#rtype"5, "#rtype"24, "#rtype"5 \n\t" + +#define convert12PsRegs2U8(rtype) \ + convert6PsRegs2U8(rtype) \ + "vcvtps2dq "#rtype"6, "#rtype"6 \n\t" \ + "vcvtps2dq "#rtype"7, "#rtype"7 \n\t" \ + "vcvtps2dq "#rtype"8, "#rtype"8 \n\t" \ + "vcvtps2dq "#rtype"9, "#rtype"9 \n\t" \ + "vcvtps2dq "#rtype"10, "#rtype"10 \n\t" \ + "vcvtps2dq "#rtype"11, "#rtype"11 \n\t" \ + "vpaddd "#rtype"6, "#rtype"24, "#rtype"6 \n\t" \ + "vpaddd "#rtype"7, "#rtype"24, "#rtype"7 \n\t" \ + "vpaddd "#rtype"8, "#rtype"24, "#rtype"8 \n\t" \ + "vpaddd "#rtype"9, "#rtype"24, "#rtype"9 \n\t" \ + "vpaddd "#rtype"10, "#rtype"24, "#rtype"10 \n\t" \ + "vpaddd "#rtype"11, "#rtype"24, "#rtype"11 \n\t" + +#define convert24PsRegs2U8(rtype) \ + convert12PsRegs2U8(rtype) \ + "vcvtps2dq "#rtype"12, "#rtype"12 \n\t" \ + "vcvtps2dq "#rtype"13, "#rtype"13 \n\t" \ + "vcvtps2dq "#rtype"14, "#rtype"14 \n\t" \ + "vcvtps2dq "#rtype"15, "#rtype"15 \n\t" \ + "vcvtps2dq "#rtype"16, "#rtype"16 \n\t" \ + "vcvtps2dq "#rtype"17, "#rtype"17 \n\t" \ + "vcvtps2dq "#rtype"18, "#rtype"18 \n\t" \ + "vcvtps2dq "#rtype"19, "#rtype"19 \n\t" \ + "vcvtps2dq "#rtype"20, "#rtype"20 \n\t" \ + "vcvtps2dq "#rtype"21, "#rtype"21 \n\t" \ + "vcvtps2dq "#rtype"22, "#rtype"22 \n\t" \ + "vcvtps2dq "#rtype"23, "#rtype"23 \n\t" \ + "vpaddd "#rtype"12, "#rtype"24, "#rtype"12 \n\t" \ + "vpaddd "#rtype"13, "#rtype"24, "#rtype"13 \n\t" \ + "vpaddd "#rtype"14, "#rtype"24, "#rtype"14 \n\t" \ + "vpaddd "#rtype"15, "#rtype"24, "#rtype"15 \n\t" \ + "vpaddd "#rtype"16, "#rtype"24, "#rtype"16 \n\t" \ + "vpaddd "#rtype"17, "#rtype"24, "#rtype"17 \n\t" \ + "vpaddd "#rtype"18, "#rtype"24, "#rtype"18 \n\t" \ + "vpaddd "#rtype"19, "#rtype"24, "#rtype"19 \n\t" \ + "vpaddd "#rtype"20, "#rtype"24, "#rtype"20 \n\t" \ + "vpaddd "#rtype"21, "#rtype"24, "#rtype"21 \n\t" \ + "vpaddd "#rtype"22, "#rtype"24, "#rtype"22 \n\t" \ + "vpaddd "#rtype"23, "#rtype"24, "#rtype"23 \n\t" + +#define mmm_1_48(A, K) \ + "movq "#A", %%rax \n\t" \ + "vpbroadcastd (%%rax), %%zmm30 \n\t" \ + "vpbroadcastd 0x4(%%rax), %%zmm31 \n\t" \ + "vmovups (%[B]), %%zmm27 \n\t" \ + "vmovups 0x40(%[B]), %%zmm28 \n\t" \ + "vmovups 0x80(%[B]), %%zmm29 \n\t" \ + 
"vpdpbusd %%zmm24, %%zmm30, %%zmm0 \n\t" \ + "vpdpbusd %%zmm25, %%zmm30, %%zmm1 \n\t" \ + "vpdpbusd %%zmm26, %%zmm30, %%zmm2 \n\t" \ + "vmovups 0xC0(%[B]), %%zmm24 \n\t" \ + "vmovups 0x100(%[B]), %%zmm25 \n\t" \ + "vmovups 0x140(%[B]), %%zmm26 \n\t" \ + "vpdpbusd %%zmm27, %%zmm31, %%zmm0 \n\t" \ + "vpdpbusd %%zmm28, %%zmm31, %%zmm1 \n\t" \ + "vpdpbusd %%zmm29, %%zmm31, %%zmm2 \n\t" + +#define mmm_2_48(A, K) \ + "movq "#A", %%rax \n\t" \ + "vpbroadcastd (%%rax), %%zmm30 \n\t" \ + "vpbroadcastd (%%rax, "#K"), %%zmm31 \n\t" \ + "prefetcht0 0xC0(%1) \n\t" \ + "prefetcht0 0x100(%1) \n\t" \ + "prefetcht0 0x140(%1) \n\t" \ + "vmovups (%[B]), %%zmm27 \n\t" \ + "vpdpbusd %%zmm24, %%zmm30, %%zmm0 \n\t" \ + "vpdpbusd %%zmm25, %%zmm30, %%zmm1 \n\t" \ + "vmovups 0x40(%[B]), %%zmm28 \n\t" \ + "vpdpbusd %%zmm26, %%zmm30, %%zmm2 \n\t" \ + "vpdpbusd %%zmm24, %%zmm31, %%zmm3 \n\t" \ + "vmovups 0x80(%[B]), %%zmm29 \n\t" \ + "vpdpbusd %%zmm25, %%zmm31, %%zmm4 \n\t" \ + "vpdpbusd %%zmm26, %%zmm31, %%zmm5 \n\t" \ + "vpbroadcastd 0x4(%%rax), %%zmm30 \n\t" \ + "vpbroadcastd 0x4(%%rax, "#K"), %%zmm31 \n\t" \ + "prefetcht0 0x180(%[B]) \n\t" \ + "prefetcht0 0x1C0(%[B]) \n\t" \ + "prefetcht0 0x200(%[B]) \n\t" \ + "vmovups 0xC0(%[B]), %%zmm24 \n\t" \ + "vpdpbusd %%zmm27, %%zmm30, %%zmm0 \n\t" \ + "vpdpbusd %%zmm28, %%zmm30, %%zmm1 \n\t" \ + "vmovups 0x100(%[B]), %%zmm25 \n\t" \ + "vpdpbusd %%zmm29, %%zmm30, %%zmm2 \n\t" \ + "vpdpbusd %%zmm27, %%zmm31, %%zmm3 \n\t" \ + "vmovups 0x140(%[B]), %%zmm26 \n\t" \ + "vpdpbusd %%zmm28, %%zmm31, %%zmm4 \n\t" \ + "vpdpbusd %%zmm29, %%zmm31, %%zmm5 \n\t" + +#define mmm_4_48(A, K) \ + "movq "#A", %%rax \n\t" \ + "vpbroadcastd (%%rax), %%zmm30 \n\t" \ + "vpbroadcastd (%%rax, "#K"), %%zmm31 \n\t" \ + "prefetcht0 0xC0(%1) \n\t" \ + "prefetcht0 0x100(%1) \n\t" \ + "prefetcht0 0x140(%1) \n\t" \ + "vmovups (%[B]), %%zmm27 \n\t" \ + "vpdpbusd %%zmm24, %%zmm30, %%zmm0 \n\t" \ + "vpdpbusd %%zmm25, %%zmm30, %%zmm1 \n\t" \ + "vpdpbusd %%zmm26, %%zmm30, %%zmm2 \n\t" \ + "vmovups 0x40(%[B]), %%zmm28 \n\t" \ + "vpdpbusd %%zmm24, %%zmm31, %%zmm3 \n\t" \ + "vpdpbusd %%zmm25, %%zmm31, %%zmm4 \n\t" \ + "vpdpbusd %%zmm26, %%zmm31, %%zmm5 \n\t" \ + "addq "#K", %%rax \n\t" \ + "addq "#K", %%rax \n\t" \ + "vpbroadcastd (%%rax), %%zmm30 \n\t" \ + "vpbroadcastd (%%rax, "#K"), %%zmm31 \n\t" \ + "vmovups 0x80(%[B]), %%zmm29 \n\t" \ + "vpdpbusd %%zmm24, %%zmm30, %%zmm6 \n\t" \ + "vpdpbusd %%zmm25, %%zmm30, %%zmm7 \n\t" \ + "vpdpbusd %%zmm26, %%zmm30, %%zmm8 \n\t" \ + "vpdpbusd %%zmm24, %%zmm31, %%zmm9 \n\t" \ + "vpdpbusd %%zmm25, %%zmm31, %%zmm10 \n\t" \ + "vpdpbusd %%zmm26, %%zmm31, %%zmm11 \n\t" \ + "movq "#A", %%rax \n\t" \ + "addq $0x4, %%rax \n\t" \ + "vpbroadcastd (%%rax), %%zmm30 \n\t" \ + "vpbroadcastd (%%rax, "#K"), %%zmm31 \n\t" \ + "prefetcht0 0x180(%[B]) \n\t" \ + "prefetcht0 0x1C0(%[B]) \n\t" \ + "prefetcht0 0x200(%[B]) \n\t" \ + "vmovups 0xC0(%[B]), %%zmm24 \n\t" \ + "vpdpbusd %%zmm27, %%zmm30, %%zmm0 \n\t" \ + "vpdpbusd %%zmm28, %%zmm30, %%zmm1 \n\t" \ + "vpdpbusd %%zmm29, %%zmm30, %%zmm2 \n\t" \ + "vmovups 0x100(%[B]), %%zmm25 \n\t" \ + "vpdpbusd %%zmm27, %%zmm31, %%zmm3 \n\t" \ + "vpdpbusd %%zmm28, %%zmm31, %%zmm4 \n\t" \ + "vpdpbusd %%zmm29, %%zmm31, %%zmm5 \n\t" \ + "addq "#K", %%rax \n\t" \ + "addq "#K", %%rax \n\t" \ + "vpbroadcastd (%%rax), %%zmm30 \n\t" \ + "vpbroadcastd (%%rax, "#K"), %%zmm31 \n\t" \ + "vmovups 0x140(%[B]), %%zmm26 \n\t" \ + "vpdpbusd %%zmm27, %%zmm30, %%zmm6 \n\t" \ + "vpdpbusd %%zmm28, %%zmm30, %%zmm7 \n\t" \ + "vpdpbusd %%zmm29, %%zmm30, %%zmm8 \n\t" \ + "vpdpbusd 
%%zmm27, %%zmm31, %%zmm9 \n\t" \ + "vpdpbusd %%zmm28, %%zmm31, %%zmm10 \n\t" \ + "vpdpbusd %%zmm29, %%zmm31, %%zmm11 \n\t" + +#define mmm_8_48(A, K) \ + "movq "#A", %%rax \n\t" \ + "vpbroadcastd (%%rax), %%zmm30 \n\t" \ + "vpbroadcastd (%%rax, "#K"), %%zmm31 \n\t" \ + "prefetcht0 0xC0(%[B]) \n\t" \ + "prefetcht0 0x100(%[B]) \n\t" \ + "prefetcht0 0x140(%[B]) \n\t" \ + "vpdpbusd %%zmm24, %%zmm30, %%zmm0 \n\t" \ + "vpdpbusd %%zmm25, %%zmm30, %%zmm1 \n\t" \ + "vpdpbusd %%zmm26, %%zmm30, %%zmm2 \n\t" \ + "vpdpbusd %%zmm24, %%zmm31, %%zmm3 \n\t" \ + "vpdpbusd %%zmm25, %%zmm31, %%zmm4 \n\t" \ + "vpdpbusd %%zmm26, %%zmm31, %%zmm5 \n\t" \ + "addq "#K", %%rax \n\t" \ + "addq "#K", %%rax \n\t" \ + "vpbroadcastd (%%rax), %%zmm30 \n\t" \ + "vpbroadcastd (%%rax, "#K"), %%zmm31 \n\t" \ + "vmovups (%[B]), %%zmm27 \n\t" \ + "vpdpbusd %%zmm24, %%zmm30, %%zmm6 \n\t" \ + "vpdpbusd %%zmm25, %%zmm30, %%zmm7 \n\t" \ + "vpdpbusd %%zmm26, %%zmm30, %%zmm8 \n\t" \ + "vpdpbusd %%zmm24, %%zmm31, %%zmm9 \n\t" \ + "vpdpbusd %%zmm25, %%zmm31, %%zmm10 \n\t" \ + "vpdpbusd %%zmm26, %%zmm31, %%zmm11 \n\t" \ + "addq "#K", %%rax \n\t" \ + "addq "#K", %%rax \n\t" \ + "vpbroadcastd (%%rax), %%zmm30 \n\t" \ + "vpbroadcastd (%%rax, "#K"), %%zmm31 \n\t" \ + "vmovups 0x40(%[B]), %%zmm28 \n\t" \ + "vpdpbusd %%zmm24, %%zmm30, %%zmm12 \n\t" \ + "vpdpbusd %%zmm25, %%zmm30, %%zmm13 \n\t" \ + "vpdpbusd %%zmm26, %%zmm30, %%zmm14 \n\t" \ + "vpdpbusd %%zmm24, %%zmm31, %%zmm15 \n\t" \ + "vpdpbusd %%zmm25, %%zmm31, %%zmm16 \n\t" \ + "vpdpbusd %%zmm26, %%zmm31, %%zmm17 \n\t" \ + "addq "#K", %%rax \n\t" \ + "addq "#K", %%rax \n\t" \ + "vpbroadcastd (%%rax), %%zmm30 \n\t" \ + "vpbroadcastd (%%rax, "#K"), %%zmm31 \n\t" \ + "vmovups 0x80(%[B]), %%zmm29 \n\t" \ + "vpdpbusd %%zmm24, %%zmm30, %%zmm18 \n\t" \ + "vpdpbusd %%zmm25, %%zmm30, %%zmm19 \n\t" \ + "vpdpbusd %%zmm26, %%zmm30, %%zmm20 \n\t" \ + "vpdpbusd %%zmm24, %%zmm31, %%zmm21 \n\t" \ + "vpdpbusd %%zmm25, %%zmm31, %%zmm22 \n\t" \ + "vpdpbusd %%zmm26, %%zmm31, %%zmm23 \n\t" \ + "movq "#A", %%rax \n\t" \ + "addq $0x4, %%rax \n\t" \ + "vpbroadcastd (%%rax), %%zmm30 \n\t" \ + "vpbroadcastd (%%rax, "#K"), %%zmm31 \n\t" \ + "prefetcht0 0x180(%[B]) \n\t" \ + "prefetcht0 0x1C0(%[B]) \n\t" \ + "prefetcht0 0x200(%[B]) \n\t" \ + "vpdpbusd %%zmm27, %%zmm30, %%zmm0 \n\t" \ + "vpdpbusd %%zmm28, %%zmm30, %%zmm1 \n\t" \ + "vpdpbusd %%zmm29, %%zmm30, %%zmm2 \n\t" \ + "vpdpbusd %%zmm27, %%zmm31, %%zmm3 \n\t" \ + "vpdpbusd %%zmm28, %%zmm31, %%zmm4 \n\t" \ + "vpdpbusd %%zmm29, %%zmm31, %%zmm5 \n\t" \ + "addq "#K", %%rax \n\t" \ + "addq "#K", %%rax \n\t" \ + "vpbroadcastd (%%rax), %%zmm30 \n\t" \ + "vpbroadcastd (%%rax, "#K"), %%zmm31 \n\t" \ + "vmovups 0xC0(%[B]), %%zmm24 \n\t" \ + "vpdpbusd %%zmm27, %%zmm30, %%zmm6 \n\t" \ + "vpdpbusd %%zmm28, %%zmm30, %%zmm7 \n\t" \ + "vpdpbusd %%zmm29, %%zmm30, %%zmm8 \n\t" \ + "vpdpbusd %%zmm27, %%zmm31, %%zmm9 \n\t" \ + "vpdpbusd %%zmm28, %%zmm31, %%zmm10 \n\t" \ + "vpdpbusd %%zmm29, %%zmm31, %%zmm11 \n\t" \ + "addq "#K", %%rax \n\t" \ + "addq "#K", %%rax \n\t" \ + "vpbroadcastd (%%rax), %%zmm30 \n\t" \ + "vpbroadcastd (%%rax, "#K"), %%zmm31 \n\t" \ + "vmovups 0x100(%[B]), %%zmm25 \n\t" \ + "vpdpbusd %%zmm27, %%zmm30, %%zmm12 \n\t" \ + "vpdpbusd %%zmm28, %%zmm30, %%zmm13 \n\t" \ + "vpdpbusd %%zmm29, %%zmm30, %%zmm14 \n\t" \ + "vpdpbusd %%zmm27, %%zmm31, %%zmm15 \n\t" \ + "vpdpbusd %%zmm28, %%zmm31, %%zmm16 \n\t" \ + "vpdpbusd %%zmm29, %%zmm31, %%zmm17 \n\t" \ + "addq "#K", %%rax \n\t" \ + "addq "#K", %%rax \n\t" \ + "vpbroadcastd (%%rax), %%zmm30 \n\t" \ + "vpbroadcastd 
(%%rax, "#K"), %%zmm31 \n\t" \ + "vmovups 0x140(%[B]), %%zmm26 \n\t" \ + "vpdpbusd %%zmm27, %%zmm30, %%zmm18 \n\t" \ + "vpdpbusd %%zmm28, %%zmm30, %%zmm19 \n\t" \ + "vpdpbusd %%zmm29, %%zmm30, %%zmm20 \n\t" \ + "vpdpbusd %%zmm27, %%zmm31, %%zmm21 \n\t" \ + "vpdpbusd %%zmm28, %%zmm31, %%zmm22 \n\t" \ + "vpdpbusd %%zmm29, %%zmm31, %%zmm23 \n\t" + +#define mmm_1_32(A, K) \ + "movq "#A", %%rax \n\t" \ + "vpbroadcastd (%%rax), %%zmm28 \n\t" \ + "vpbroadcastd 0x4(%%rax), %%zmm29 \n\t" \ + "prefetcht0 0x80(%[B]) \n\t" \ + "prefetcht0 0xC0(%[B]) \n\t" \ + "vmovups (%[B]), %%zmm26 \n\t" \ + "vmovups 0x40(%[B]), %%zmm27 \n\t" \ + "vpdpbusd %%zmm24, %%zmm28, %%zmm0 \n\t" \ + "vpdpbusd %%zmm25, %%zmm28, %%zmm1 \n\t" \ + "prefetcht0 0x100(%[B]) \n\t" \ + "prefetcht0 0x140(%[B]) \n\t" \ + "vmovups 0x80(%[B]), %%zmm24 \n\t" \ + "vmovups 0xC0(%[B]), %%zmm25 \n\t" \ + "vpdpbusd %%zmm26, %%zmm29, %%zmm0 \n\t" \ + "vpdpbusd %%zmm27, %%zmm29, %%zmm1 \n\t" + +#define mmm_3_32(A, K) \ + "movq "#A", %%rax \n\t" \ + "vpbroadcastd (%%rax), %%zmm28 \n\t" \ + "vpbroadcastd (%%rax, "#K"), %%zmm29 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), %%zmm30 \n\t" \ + "prefetcht0 0x80(%[B]) \n\t" \ + "prefetcht0 0xC0(%[B]) \n\t" \ + "vmovups (%[B]), %%zmm26 \n\t" \ + "vpdpbusd %%zmm24, %%zmm28, %%zmm0 \n\t" \ + "vpdpbusd %%zmm25, %%zmm28, %%zmm1 \n\t" \ + "vpdpbusd %%zmm24, %%zmm29, %%zmm2 \n\t" \ + "vmovups 0x40(%[B]), %%zmm27 \n\t" \ + "vpdpbusd %%zmm25, %%zmm29, %%zmm3 \n\t" \ + "vpdpbusd %%zmm24, %%zmm30, %%zmm4 \n\t" \ + "vpdpbusd %%zmm25, %%zmm30, %%zmm5 \n\t" \ + "movq "#A", %%rax \n\t" \ + "addq $0x4, %%rax \n\t" \ + "vpbroadcastd (%%rax), %%zmm28 \n\t" \ + "vpbroadcastd (%%rax, "#K"), %%zmm29 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), %%zmm30 \n\t" \ + "vpbroadcastd (%%rax, %%rbx), %%zmm31 \n\t" \ + "prefetcht0 0x100(%[B]) \n\t" \ + "prefetcht0 0x140(%[B]) \n\t" \ + "vmovups 0x80(%[B]), %%zmm24 \n\t" \ + "vpdpbusd %%zmm26, %%zmm28, %%zmm0 \n\t" \ + "vpdpbusd %%zmm27, %%zmm28, %%zmm1 \n\t" \ + "vpdpbusd %%zmm26, %%zmm29, %%zmm2 \n\t" \ + "vmovups 0xC0(%[B]), %%zmm25 \n\t" \ + "vpdpbusd %%zmm27, %%zmm29, %%zmm3 \n\t" \ + "vpdpbusd %%zmm26, %%zmm30, %%zmm4 \n\t" \ + "vpdpbusd %%zmm27, %%zmm30, %%zmm5 \n\t" + +#define mmm_6_32(A, K) \ + "movq "#A", %%rax \n\t" \ + "movq "#K", %%rbx \n\t" \ + "addq "#K", %%rbx \n\t" \ + "addq "#K", %%rbx \n\t" \ + "vpbroadcastd (%%rax), %%zmm28 \n\t" \ + "vpbroadcastd (%%rax, "#K"), %%zmm29 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), %%zmm30 \n\t" \ + "vpbroadcastd (%%rax, %%rbx), %%zmm31 \n\t" \ + "prefetcht0 0x80(%[B]) \n\t" \ + "prefetcht0 0xC0(%[B]) \n\t" \ + "vmovups (%[B]), %%zmm26 \n\t" \ + "vpdpbusd %%zmm24, %%zmm28, %%zmm0 \n\t" \ + "vpdpbusd %%zmm25, %%zmm28, %%zmm1 \n\t" \ + "vpdpbusd %%zmm24, %%zmm29, %%zmm2 \n\t" \ + "vpdpbusd %%zmm25, %%zmm29, %%zmm3 \n\t" \ + "vpdpbusd %%zmm24, %%zmm30, %%zmm4 \n\t" \ + "vpdpbusd %%zmm25, %%zmm30, %%zmm5 \n\t" \ + "vpdpbusd %%zmm24, %%zmm31, %%zmm6 \n\t" \ + "vpdpbusd %%zmm25, %%zmm31, %%zmm7 \n\t" \ + "addq "#K", %%rax \n\t" \ + "addq %%rbx, %%rax \n\t" \ + "vpbroadcastd (%%rax), %%zmm28 \n\t" \ + "vpbroadcastd (%%rax, "#K"), %%zmm29 \n\t" \ + "vmovups 0x40(%[B]), %%zmm27 \n\t" \ + "vpdpbusd %%zmm24, %%zmm28, %%zmm8 \n\t" \ + "vpdpbusd %%zmm25, %%zmm28, %%zmm9 \n\t" \ + "vpdpbusd %%zmm24, %%zmm29, %%zmm10 \n\t" \ + "vpdpbusd %%zmm25, %%zmm29, %%zmm11 \n\t" \ + "movq "#A", %%rax \n\t" \ + "addq $0x4, %%rax \n\t" \ + "vpbroadcastd (%%rax), %%zmm28 \n\t" \ + "vpbroadcastd (%%rax, "#K"), %%zmm29 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), %%zmm30 \n\t" 
\ + "vpbroadcastd (%%rax, %%rbx), %%zmm31 \n\t" \ + "prefetcht0 0x100(%[B]) \n\t" \ + "prefetcht0 0x140(%[B]) \n\t" \ + "vmovups 0x80(%[B]), %%zmm24 \n\t" \ + "vpdpbusd %%zmm26, %%zmm28, %%zmm0 \n\t" \ + "vpdpbusd %%zmm27, %%zmm28, %%zmm1 \n\t" \ + "vpdpbusd %%zmm26, %%zmm29, %%zmm2 \n\t" \ + "vpdpbusd %%zmm27, %%zmm29, %%zmm3 \n\t" \ + "vpdpbusd %%zmm26, %%zmm30, %%zmm4 \n\t" \ + "vpdpbusd %%zmm27, %%zmm30, %%zmm5 \n\t" \ + "vpdpbusd %%zmm26, %%zmm31, %%zmm6 \n\t" \ + "vpdpbusd %%zmm27, %%zmm31, %%zmm7 \n\t" \ + "addq "#K", %%rax \n\t" \ + "addq %%rbx, %%rax \n\t" \ + "vpbroadcastd (%%rax), %%zmm28 \n\t" \ + "vpbroadcastd (%%rax, "#K"), %%zmm29 \n\t" \ + "vmovups 0xC0(%[B]), %%zmm25 \n\t" \ + "vpdpbusd %%zmm26, %%zmm28, %%zmm8 \n\t" \ + "vpdpbusd %%zmm27, %%zmm28, %%zmm9 \n\t" \ + "vpdpbusd %%zmm26, %%zmm29, %%zmm10 \n\t" \ + "vpdpbusd %%zmm27, %%zmm29, %%zmm11 \n\t" + +#define mmm_12_32(A, K) \ + "movq "#A", %%rax \n\t" \ + "movq "#K", %%rbx \n\t" \ + "addq "#K", %%rbx \n\t" \ + "addq "#K", %%rbx \n\t" \ + "vpbroadcastd (%%rax), %%zmm28 \n\t" \ + "vpbroadcastd (%%rax, "#K"), %%zmm29 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), %%zmm30 \n\t" \ + "vpbroadcastd (%%rax, %%rbx), %%zmm31 \n\t" \ + "prefetcht0 0x80(%[B]) \n\t" \ + "prefetcht0 0xC0(%[B]) \n\t" \ + "vpdpbusd %%zmm24, %%zmm28, %%zmm0 \n\t" \ + "vpdpbusd %%zmm25, %%zmm28, %%zmm1 \n\t" \ + "vpdpbusd %%zmm24, %%zmm29, %%zmm2 \n\t" \ + "vpdpbusd %%zmm25, %%zmm29, %%zmm3 \n\t" \ + "vpdpbusd %%zmm24, %%zmm30, %%zmm4 \n\t" \ + "vpdpbusd %%zmm25, %%zmm30, %%zmm5 \n\t" \ + "vpdpbusd %%zmm24, %%zmm31, %%zmm6 \n\t" \ + "vpdpbusd %%zmm25, %%zmm31, %%zmm7 \n\t" \ + "addq "#K", %%rax \n\t" \ + "addq %%rbx, %%rax \n\t" \ + "vpbroadcastd (%%rax), %%zmm28 \n\t" \ + "vpbroadcastd (%%rax, "#K"), %%zmm29 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), %%zmm30 \n\t" \ + "vpbroadcastd (%%rax, %%rbx), %%zmm31 \n\t" \ + "vmovups (%[B]), %%zmm26 \n\t" \ + "vpdpbusd %%zmm24, %%zmm28, %%zmm8 \n\t" \ + "vpdpbusd %%zmm25, %%zmm28, %%zmm9 \n\t" \ + "vpdpbusd %%zmm24, %%zmm29, %%zmm10 \n\t" \ + "vpdpbusd %%zmm25, %%zmm29, %%zmm11 \n\t" \ + "vpdpbusd %%zmm24, %%zmm30, %%zmm12 \n\t" \ + "vpdpbusd %%zmm25, %%zmm30, %%zmm13 \n\t" \ + "vpdpbusd %%zmm24, %%zmm31, %%zmm14 \n\t" \ + "vpdpbusd %%zmm25, %%zmm31, %%zmm15 \n\t" \ + "addq "#K", %%rax \n\t" \ + "addq %%rbx, %%rax \n\t" \ + "vpbroadcastd (%%rax), %%zmm28 \n\t" \ + "vpbroadcastd (%%rax, "#K"), %%zmm29 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), %%zmm30 \n\t" \ + "vpbroadcastd (%%rax, %%rbx), %%zmm31 \n\t" \ + "vmovups 0x40(%[B]), %%zmm27 \n\t" \ + "vpdpbusd %%zmm24, %%zmm28, %%zmm16 \n\t" \ + "vpdpbusd %%zmm25, %%zmm28, %%zmm17 \n\t" \ + "vpdpbusd %%zmm24, %%zmm29, %%zmm18 \n\t" \ + "vpdpbusd %%zmm25, %%zmm29, %%zmm19 \n\t" \ + "vpdpbusd %%zmm24, %%zmm30, %%zmm20 \n\t" \ + "vpdpbusd %%zmm25, %%zmm30, %%zmm21 \n\t" \ + "vpdpbusd %%zmm24, %%zmm31, %%zmm22 \n\t" \ + "vpdpbusd %%zmm25, %%zmm31, %%zmm23 \n\t" \ + "movq "#A", %%rax \n\t" \ + "addq $0x4, %%rax \n\t" \ + "vpbroadcastd (%%rax), %%zmm28 \n\t" \ + "vpbroadcastd (%%rax, "#K"), %%zmm29 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), %%zmm30 \n\t" \ + "vpbroadcastd (%%rax, %%rbx), %%zmm31 \n\t" \ + "prefetcht0 0x100(%[B]) \n\t" \ + "prefetcht0 0x140(%[B]) \n\t" \ + "vpdpbusd %%zmm26, %%zmm28, %%zmm0 \n\t" \ + "vpdpbusd %%zmm27, %%zmm28, %%zmm1 \n\t" \ + "vpdpbusd %%zmm26, %%zmm29, %%zmm2 \n\t" \ + "vpdpbusd %%zmm27, %%zmm29, %%zmm3 \n\t" \ + "vpdpbusd %%zmm26, %%zmm30, %%zmm4 \n\t" \ + "vpdpbusd %%zmm27, %%zmm30, %%zmm5 \n\t" \ + "vpdpbusd %%zmm26, %%zmm31, %%zmm6 \n\t" \ + 
"vpdpbusd %%zmm27, %%zmm31, %%zmm7 \n\t" \ + "addq "#K", %%rax \n\t" \ + "addq %%rbx, %%rax \n\t" \ + "vpbroadcastd (%%rax), %%zmm28 \n\t" \ + "vpbroadcastd (%%rax, "#K"), %%zmm29 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), %%zmm30 \n\t" \ + "vpbroadcastd (%%rax, %%rbx), %%zmm31 \n\t" \ + "vmovups 0x80(%[B]), %%zmm24 \n\t" \ + "vpdpbusd %%zmm26, %%zmm28, %%zmm8 \n\t" \ + "vpdpbusd %%zmm27, %%zmm28, %%zmm9 \n\t" \ + "vpdpbusd %%zmm26, %%zmm29, %%zmm10 \n\t" \ + "vpdpbusd %%zmm27, %%zmm29, %%zmm11 \n\t" \ + "vpdpbusd %%zmm26, %%zmm30, %%zmm12 \n\t" \ + "vpdpbusd %%zmm27, %%zmm30, %%zmm13 \n\t" \ + "vpdpbusd %%zmm26, %%zmm31, %%zmm14 \n\t" \ + "vpdpbusd %%zmm27, %%zmm31, %%zmm15 \n\t" \ + "addq "#K", %%rax \n\t" \ + "addq %%rbx, %%rax \n\t" \ + "vpbroadcastd (%%rax), %%zmm28 \n\t" \ + "vpbroadcastd (%%rax, "#K"), %%zmm29 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), %%zmm30 \n\t" \ + "vpbroadcastd (%%rax, %%rbx), %%zmm31 \n\t" \ + "vmovups 0xC0(%[B]), %%zmm25 \n\t" \ + "vpdpbusd %%zmm26, %%zmm28, %%zmm16 \n\t" \ + "vpdpbusd %%zmm27, %%zmm28, %%zmm17 \n\t" \ + "vpdpbusd %%zmm26, %%zmm29, %%zmm18 \n\t" \ + "vpdpbusd %%zmm27, %%zmm29, %%zmm19 \n\t" \ + "vpdpbusd %%zmm26, %%zmm30, %%zmm20 \n\t" \ + "vpdpbusd %%zmm27, %%zmm30, %%zmm21 \n\t" \ + "vpdpbusd %%zmm26, %%zmm31, %%zmm22 \n\t" \ + "vpdpbusd %%zmm27, %%zmm31, %%zmm23 \n\t" + +#define mmm_1_16(A, K, rtype, off) \ + "vpbroadcastd ("#A"), "#rtype"25 \n\t" \ + "vpbroadcastd 0x4("#A"), "#rtype"26 \n\t" \ + "vmovups (%[B]), "#rtype"31 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"25, "#rtype"0 \n\t" \ + "vmovups "#off"(%[B]), "#rtype"24 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"26, "#rtype"0 \n\t" + +#define mmm_6_16(A, K, rtype, off) \ + "movq "#A", %%rax \n\t" \ + "movq "#K", %%rbx \n\t" \ + "addq "#K", %%rbx \n\t" \ + "addq "#K", %%rbx \n\t" \ + "vpbroadcastd (%%rax), "#rtype"25 \n\t" \ + "vpbroadcastd (%%rax, "#K"), "#rtype"26 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), "#rtype"27 \n\t" \ + "addq %%rbx, %%rax \n\t" \ + "vpbroadcastd (%%rax), "#rtype"28 \n\t" \ + "vpbroadcastd (%%rax, "#K"), "#rtype"29 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), "#rtype"30 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"25, "#rtype"0 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"26, "#rtype"1 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"27, "#rtype"2 \n\t" \ + "vmovups (%[B]), "#rtype"31 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"28, "#rtype"3 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"29, "#rtype"4 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"30, "#rtype"5 \n\t" \ + "movq "#A", %%rax \n\t" \ + "addq $0x4, %%rax \n\t" \ + "vpbroadcastd (%%rax), "#rtype"25 \n\t" \ + "vpbroadcastd (%%rax, "#K"), "#rtype"26 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), "#rtype"27 \n\t" \ + "addq %%rbx, %%rax \n\t" \ + "vpbroadcastd (%%rax), "#rtype"28 \n\t" \ + "vpbroadcastd (%%rax, "#K"), "#rtype"29 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), "#rtype"30 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"25, "#rtype"0 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"26, "#rtype"1 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"27, "#rtype"2 \n\t" \ + "vmovups "#off"(%[B]), "#rtype"24 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"28, "#rtype"3 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"29, "#rtype"4 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"30, "#rtype"5 \n\t" \ + +#define mmm_12_16(A, K, rtype, off) \ + "movq "#A", %%rax \n\t" \ + "movq "#K", %%rbx \n\t" \ + "addq "#K", %%rbx \n\t" \ + "addq "#K", %%rbx \n\t" \ + "vpbroadcastd (%%rax), "#rtype"25 \n\t" \ + "vpbroadcastd (%%rax, "#K"), "#rtype"26 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), "#rtype"27 \n\t" \ + "addq %%rbx, 
%%rax \n\t" \ + "vpbroadcastd (%%rax), "#rtype"28 \n\t" \ + "vpbroadcastd (%%rax, "#K"), "#rtype"29 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), "#rtype"30 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"25, "#rtype"0 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"26, "#rtype"1 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"27, "#rtype"2 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"28, "#rtype"3 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"29, "#rtype"4 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"30, "#rtype"5 \n\t" \ + "vmovups (%[B]), "#rtype"31 \n\t" \ + "addq %%rbx, %%rax \n\t" \ + "vpbroadcastd (%%rax), "#rtype"25 \n\t" \ + "vpbroadcastd (%%rax, "#K"), "#rtype"26 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), "#rtype"27 \n\t" \ + "addq %%rbx, %%rax \n\t" \ + "vpbroadcastd (%%rax), "#rtype"28 \n\t" \ + "vpbroadcastd (%%rax, "#K"), "#rtype"29 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), "#rtype"30 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"25, "#rtype"6 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"26, "#rtype"7 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"27, "#rtype"8 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"28, "#rtype"9 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"29, "#rtype"10 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"30, "#rtype"11 \n\t" \ + "movq "#A", %%rax \n\t" \ + "addq $0x4, %%rax \n\t" \ + "vpbroadcastd (%%rax), "#rtype"25 \n\t" \ + "vpbroadcastd (%%rax, "#K"), "#rtype"26 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), "#rtype"27 \n\t" \ + "addq %%rbx, %%rax \n\t" \ + "vpbroadcastd (%%rax), "#rtype"28 \n\t" \ + "vpbroadcastd (%%rax, "#K"), "#rtype"29 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), "#rtype"30 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"25, "#rtype"0 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"26, "#rtype"1 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"27, "#rtype"2 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"28, "#rtype"3 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"29, "#rtype"4 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"30, "#rtype"5 \n\t" \ + "vmovups "#off"(%[B]), "#rtype"24 \n\t" \ + "addq %%rbx, %%rax \n\t" \ + "vpbroadcastd (%%rax), "#rtype"25 \n\t" \ + "vpbroadcastd (%%rax, "#K"), "#rtype"26 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), "#rtype"27 \n\t" \ + "addq %%rbx, %%rax \n\t" \ + "vpbroadcastd (%%rax), "#rtype"28 \n\t" \ + "vpbroadcastd (%%rax, "#K"), "#rtype"29 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), "#rtype"30 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"25, "#rtype"6 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"26, "#rtype"7 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"27, "#rtype"8 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"28, "#rtype"9 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"29, "#rtype"10 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"30, "#rtype"11 \n\t" + +#define mmm_24_16(A, K, rtype, off) \ + "movq "#A", %%rax \n\t" \ + "movq "#K", %%rbx \n\t" \ + "addq "#K", %%rbx \n\t" \ + "addq "#K", %%rbx \n\t" \ + "vpbroadcastd (%%rax), "#rtype"25 \n\t" \ + "vpbroadcastd (%%rax, "#K"), "#rtype"26 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), "#rtype"27 \n\t" \ + "addq %%rbx, %%rax \n\t" \ + "vpbroadcastd (%%rax), "#rtype"28 \n\t" \ + "vpbroadcastd (%%rax, "#K"), "#rtype"29 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), "#rtype"30 \n\t" \ + "prefetcht0 0x80(%[B]) \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"25, "#rtype"0 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"26, "#rtype"1 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"27, "#rtype"2 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"28, "#rtype"3 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"29, "#rtype"4 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"30, "#rtype"5 \n\t" \ + "addq %%rbx, %%rax \n\t" \ + "vpbroadcastd (%%rax), "#rtype"25 \n\t" \ + "vpbroadcastd (%%rax, 
"#K"), "#rtype"26 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), "#rtype"27 \n\t" \ + "addq %%rbx, %%rax \n\t" \ + "vpbroadcastd (%%rax), "#rtype"28 \n\t" \ + "vpbroadcastd (%%rax, "#K"), "#rtype"29 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), "#rtype"30 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"25, "#rtype"6 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"26, "#rtype"7 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"27, "#rtype"8 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"28, "#rtype"9 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"29, "#rtype"10 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"30, "#rtype"11 \n\t" \ + "addq %%rbx, %%rax \n\t" \ + "vpbroadcastd (%%rax), "#rtype"25 \n\t" \ + "vpbroadcastd (%%rax, "#K"), "#rtype"26 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), "#rtype"27 \n\t" \ + "addq %%rbx, %%rax \n\t" \ + "vpbroadcastd (%%rax), "#rtype"28 \n\t" \ + "vpbroadcastd (%%rax, "#K"), "#rtype"29 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), "#rtype"30 \n\t" \ + "vmovups (%[B]), "#rtype"31 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"25, "#rtype"12 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"26, "#rtype"13 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"27, "#rtype"14 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"28, "#rtype"15 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"29, "#rtype"16 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"30, "#rtype"17 \n\t" \ + "addq %%rbx, %%rax \n\t" \ + "vpbroadcastd (%%rax), "#rtype"25 \n\t" \ + "vpbroadcastd (%%rax, "#K"), "#rtype"26 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), "#rtype"27 \n\t" \ + "addq %%rbx, %%rax \n\t" \ + "vpbroadcastd (%%rax), "#rtype"28 \n\t" \ + "vpbroadcastd (%%rax, "#K"), "#rtype"29 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), "#rtype"30 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"25, "#rtype"18 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"26, "#rtype"19 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"27, "#rtype"20 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"28, "#rtype"21 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"29, "#rtype"22 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"30, "#rtype"23 \n\t" \ + "movq "#A", %%rax \n\t" \ + "addq $0x4, %%rax \n\t" \ + "vpbroadcastd (%%rax), "#rtype"25 \n\t" \ + "vpbroadcastd (%%rax, "#K"), "#rtype"26 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), "#rtype"27 \n\t" \ + "addq %%rbx, %%rax \n\t" \ + "vpbroadcastd (%%rax), "#rtype"28 \n\t" \ + "vpbroadcastd (%%rax, "#K"), "#rtype"29 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), "#rtype"30 \n\t" \ + "prefetcht0 0xC0(%[B]) \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"25, "#rtype"0 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"26, "#rtype"1 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"27, "#rtype"2 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"28, "#rtype"3 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"29, "#rtype"4 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"30, "#rtype"5 \n\t" \ + "addq %%rbx, %%rax \n\t" \ + "vpbroadcastd (%%rax), "#rtype"25 \n\t" \ + "vpbroadcastd (%%rax, "#K"), "#rtype"26 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), "#rtype"27 \n\t" \ + "addq %%rbx, %%rax \n\t" \ + "vpbroadcastd (%%rax), "#rtype"28 \n\t" \ + "vpbroadcastd (%%rax, "#K"), "#rtype"29 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), "#rtype"30 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"25, "#rtype"6 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"26, "#rtype"7 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"27, "#rtype"8 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"28, "#rtype"9 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"29, "#rtype"10 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"30, "#rtype"11 \n\t" \ + "addq %%rbx, %%rax \n\t" \ + "vpbroadcastd (%%rax), "#rtype"25 \n\t" \ + "vpbroadcastd (%%rax, "#K"), "#rtype"26 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), 
"#rtype"27 \n\t" \ + "addq %%rbx, %%rax \n\t" \ + "vpbroadcastd (%%rax), "#rtype"28 \n\t" \ + "vpbroadcastd (%%rax, "#K"), "#rtype"29 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), "#rtype"30 \n\t" \ + "vmovups "#off"(%[B]), "#rtype"24 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"25, "#rtype"12 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"26, "#rtype"13 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"27, "#rtype"14 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"28, "#rtype"15 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"29, "#rtype"16 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"30, "#rtype"17 \n\t" \ + "addq %%rbx, %%rax \n\t" \ + "vpbroadcastd (%%rax), "#rtype"25 \n\t" \ + "vpbroadcastd (%%rax, "#K"), "#rtype"26 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), "#rtype"27 \n\t" \ + "addq %%rbx, %%rax \n\t" \ + "vpbroadcastd (%%rax), "#rtype"28 \n\t" \ + "vpbroadcastd (%%rax, "#K"), "#rtype"29 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), "#rtype"30 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"25, "#rtype"18 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"26, "#rtype"19 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"27, "#rtype"20 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"28, "#rtype"21 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"29, "#rtype"22 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"30, "#rtype"23 \n\t" + +#define mmm_m_48_asm(m, n, nRegs, mRegs, edge) \ + __asm__ __volatile__( \ + "prefetcht0 0xC0(%[B]) \n\t" \ + "prefetcht0 0x100(%[B]) \n\t" \ + "prefetcht0 0x140(%[B]) \n\t" \ + "vmovups (%[B]), %%zmm24 \n\t" \ + "vmovups 0x40(%[B]), %%zmm25 \n\t" \ + "vmovups 0x80(%[B]), %%zmm26 \n\t" \ + "add $0xC0, %[B] \n\t" \ + "movq %[flags], %%rax \n\t" \ + "andq $0x1, %%rax \n\t" \ + "jne 0f \n\t" \ + loadOffset_##m##_##n \ + "jmp 1f \n\t" \ + ".align 16 \n\t" \ + "0: \n\t" \ + clear##nRegs##Regs(%%zmm) \ + ".align 16 \n\t" \ + "1: \n\t" \ + "movq %[C], %%rax \n\t" \ + "add %[N], %%rax \n\t" \ + "prefetcht0 (%%rax) \n\t" \ + "prefetcht0 0x40(%%rax) \n\t" \ + "prefetcht0 0x80(%%rax) \n\t" \ + "prefetcht0 (%%rax, %[N]) \n\t" \ + "prefetcht0 0x40(%%rax, %[N]) \n\t" \ + "prefetcht0 0x80(%%rax, %[N]) \n\t" \ + "add %[N], %%rax \n\t" \ + "prefetcht0 (%%rax) \n\t" \ + "prefetcht0 0x40(%%rax) \n\t" \ + "prefetcht0 0x80(%%rax) \n\t" \ + "prefetcht0 (%%rax, %[N]) \n\t" \ + "prefetcht0 0x40(%%rax, %[N]) \n\t" \ + "prefetcht0 0x80(%%rax, %[N]) \n\t" \ + "add %[N], %%rax \n\t" \ + "prefetcht0 (%%rax) \n\t" \ + "prefetcht0 0x40(%%rax) \n\t" \ + "prefetcht0 0x80(%%rax) \n\t" \ + "prefetcht0 (%%rax, %[N]) \n\t" \ + "prefetcht0 0x40(%%rax, %[N]) \n\t" \ + "prefetcht0 0x80(%%rax, %[N]) \n\t" \ + "add %[N], %%rax \n\t" \ + "prefetcht0 (%%rax) \n\t" \ + "prefetcht0 0x40(%%rax) \n\t" \ + "prefetcht0 0x80(%%rax) \n\t" \ + "prefetcht0 (%%rax, %[N]) \n\t" \ + "prefetcht0 0x40(%%rax, %[N]) \n\t" \ + "prefetcht0 0x80(%%rax, %[N]) \n\t" \ + "movq %[bk], %%rcx \n\t" \ + "shr $3, %%rcx \n\t" \ + "je 3f \n\t" \ + ".align 16 \n\t" \ + "2: \n\t" \ + mmm_##m##_48(%[A], %[K]) \ + "add $0x180, %[B] \n\t" \ + "add $0x8, %[A] \n\t" \ + "dec %%rcx \n\t" \ + "jg 2b \n\t" \ + ".align 16 \n\t" \ + "3: \n\t" \ + "movq %[bk], %%rcx \n\t" \ + "and $7, %%rcx \n\t" \ + "je 4f \n\t" \ + "movq $8, %%rcx \n\t" \ + mmm_##m##_48(%[resK], %%rcx) \ + ".align 16 \n\t" \ + "4: \n\t" \ + "movq %[C], %%rax \n\t" \ + "movq %[N], %%rcx \n\t" \ + "addq %[N], %%rcx \n\t" \ + addC_##m##_##n(%%rax) \ + "cmpq $0x0, %[s] \n\t" \ + "je 5f \n\t" \ + convert##nRegs##I32Regs2Ps(%%zmm, %[s]) \ + "movq %[flags], %%rax \n\t" \ + "andq $0x2, %%rax \n\t" \ + "je 5f \n\t" \ + convert##nRegs##PsRegs2U8(%%zmm) \ + 
storeC_##mRegs##_##n##_##edge(vpmovusdb, %%zmm, %[u8C], 0x10, 0x20) \ + "jmp 6f \n\t" \ + ".align 16 \n\t" \ + "5: \n\t" \ + storeC_##mRegs##_##n##_##edge(vmovups, %%zmm, %[C], 0x40, 0x80) \ + ".align 16 \n\t" \ + "6: \n\t" \ + : [B] "+r" (matrixB) \ + : [A] "r" (matrixA), \ + [C] "r" (matrixC), \ + [bk] "r" ((int64_t)bk), \ + [N]"r" ((int64_t)(N * 4)), \ + [s] "r" (scale), \ + [K] "r" ((int64_t)stepK), \ + [offset] "r" (offsetC), \ + [flags] "b" ((int64_t)flags), \ + [u8C] "r" (u8Result), \ + [nmask] "r" (nmask), \ + [resK] "r" (resK) \ + : "%rax", "%rcx", \ + "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", \ + "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", \ + "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", \ + "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", \ + "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", \ + "%zmm30", "%zmm31", "memory", "cc"); + +#define mmm_m_32_asm(m, n, nRegs, mRegs, edge) \ + __asm__ __volatile__( \ + "prefetcht0 0xC0(%[B]) \n\t" \ + "prefetcht0 0x100(%[B]) \n\t" \ + "vmovups (%[B]), %%zmm24 \n\t" \ + "vmovups 0x40(%[B]), %%zmm25 \n\t" \ + "add $0x80, %[B] \n\t" \ + "movq %[flags], %%rax \n\t" \ + "andq $0x1, %%rax \n\t" \ + "jne 0f \n\t" \ + loadOffset_##m##_##n \ + "jmp 1f \n\t" \ + ".align 16 \n\t" \ + "0: \n\t" \ + clear##nRegs##Regs(%%zmm) \ + ".align 16 \n\t" \ + "1: \n\t" \ + "movq %[C], %%rax \n\t" \ + "add %[N], %%rax \n\t" \ + "prefetcht0 (%%rax) \n\t" \ + "prefetcht0 0x40(%%rax) \n\t" \ + "prefetcht0 (%%rax, %[N]) \n\t" \ + "prefetcht0 0x40(%%rax, %[N]) \n\t" \ + "add %[N], %%rax \n\t" \ + "prefetcht0 (%%rax) \n\t" \ + "prefetcht0 0x40(%%rax) \n\t" \ + "prefetcht0 (%%rax, %[N]) \n\t" \ + "prefetcht0 0x40(%%rax, %[N]) \n\t" \ + "add %[N], %%rax \n\t" \ + "prefetcht0 (%%rax) \n\t" \ + "prefetcht0 0x40(%%rax) \n\t" \ + "prefetcht0 (%%rax, %[N]) \n\t" \ + "prefetcht0 0x40(%%rax, %[N]) \n\t" \ + "add %[N], %%rax \n\t" \ + "prefetcht0 (%%rax) \n\t" \ + "prefetcht0 0x40(%%rax) \n\t" \ + "prefetcht0 (%%rax, %[N]) \n\t" \ + "prefetcht0 0x40(%%rax, %[N]) \n\t" \ + "movq %[bk], %%rcx \n\t" \ + "shr $3, %%rcx \n\t" \ + "je 3f \n\t" \ + ".align 16 \n\t" \ + "2: \n\t" \ + mmm_##m##_32(%[A], %[K]) \ + "add $0x100, %[B] \n\t" \ + "add $0x8, %[A] \n\t" \ + "dec %%rcx \n\t" \ + "jg 2b \n\t" \ + ".align 16 \n\t" \ + "3: \n\t" \ + "movq %[bk], %%rcx \n\t" \ + "and $7, %%rcx \n\t" \ + "je 4f \n\t" \ + "movq $8, %%rcx \n\t" \ + mmm_##m##_32(%[resK], %%rcx) \ + ".align 16 \n\t" \ + "4: \n\t" \ + addC_##m##_##n(%[C]) \ + "cmpq $0x0, %[s] \n\t" \ + "je 5f \n\t" \ + convert##nRegs##I32Regs2Ps(%%zmm, %[s]) \ + "movq %[flags], %%rax \n\t" \ + "andq $0x2, %%rax \n\t" \ + "je 5f \n\t" \ + convert##nRegs##PsRegs2U8(%%zmm) \ + storeC_##mRegs##_##n##_##edge(vpmovusdb, %%zmm, %[u8C], 0x10, 0x20) \ + "jmp 6f \n\t" \ + ".align 16 \n\t" \ + "5: \n\t" \ + storeC_##mRegs##_##n##_##edge(vmovups, %%zmm, %[C], 0x40, 0x80) \ + ".align 16 \n\t" \ + "6: \n\t" \ + : [B] "+r" (matrixB) \ + : [A] "r" (matrixA), \ + [C] "r" (matrixC), \ + [bk] "r" ((int64_t)bk), \ + [N]"r" ((int64_t)(N * 4)), \ + [s] "r" (scale), \ + [K] "r" ((int64_t)stepK), \ + [offset] "r" (offsetC), \ + [flags] "b" ((int64_t)flags), \ + [u8C] "r" (u8Result), \ + [nmask] "r" (nmask), \ + [resK] "r" (resK) \ + : "%rax", "%rcx", \ + "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", \ + "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", \ + "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", \ + "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", \ + 
"%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", \ + "%zmm30", "%zmm31", "memory", "cc"); + +#define mmm_m_16_8_asm(m, n, nRegs, mRegs, rtype, off0, off1, edge) \ + __asm__ __volatile__( \ + "vmovups (%[B]), "#rtype"24 \n\t" \ + "add $"#off0", %[B] \n\t" \ + "movq %[flags], %%rax \n\t" \ + "andq $0x1, %%rax \n\t" \ + "jne 0f \n\t" \ + loadOffset_##m##_##n(rtype) \ + "jmp 1f \n\t" \ + ".align 16 \n\t" \ + "0: \n\t" \ + clear##nRegs##Regs(rtype) \ + ".align 16 \n\t" \ + "1: \n\t" \ + "movq %[C], %%rax \n\t" \ + "add %[N], %%rax \n\t" \ + "prefetcht0 (%%rax) \n\t" \ + "prefetcht0 (%%rax, %[N]) \n\t" \ + "add %[N], %%rax \n\t" \ + "prefetcht0 (%%rax) \n\t" \ + "prefetcht0 (%%rax, %[N]) \n\t" \ + "add %[N], %%rax \n\t" \ + "prefetcht0 (%%rax) \n\t" \ + "prefetcht0 (%%rax, %[N]) \n\t" \ + "add %[N], %%rax \n\t" \ + "prefetcht0 (%%rax) \n\t" \ + "prefetcht0 (%%rax, %[N]) \n\t" \ + "movq %[bk], %%rcx \n\t" \ + "shr $3, %%rcx \n\t" \ + "je 3f \n\t" \ + ".align 16 \n\t" \ + "2: \n\t" \ + mmm_##m##_16(%[A], %[K], rtype, off0) \ + "add $"#off1", %[B] \n\t" \ + "add $0x8, %[A] \n\t" \ + "dec %%rcx \n\t" \ + "jg 2b \n\t" \ + ".align 16 \n\t" \ + "3: \n\t" \ + "movq %[bk], %%rcx \n\t" \ + "and $7, %%rcx \n\t" \ + "je 4f \n\t" \ + "movq $8, %%rcx \n\t" \ + mmm_##m##_16(%[resK], %%rcx, rtype, off0) \ + ".align 16 \n\t" \ + "4: \n\t" \ + addC_##m##_##n(rtype, %[C]) \ + "cmpq $0x0, %[s] \n\t" \ + "je 5f \n\t" \ + convert##nRegs##I32Regs2Ps(rtype, %[s]) \ + "movq %[flags], %%rax \n\t" \ + "andq $0x2, %%rax \n\t" \ + "je 5f \n\t" \ + convert##nRegs##PsRegs2U8(rtype) \ + storeC_##mRegs##_##n##_##edge(vpmovusdb, rtype, %[u8C], 0x0, 0x0) \ + "jmp 6f \n\t" \ + ".align 16 \n\t" \ + "5: \n\t" \ + storeC_##mRegs##_##n##_##edge(vmovups, rtype, %[C], 0x0, 0x0) \ + ".align 16 \n\t" \ + "6: \n\t" \ + : [B] "+r" (matrixB) \ + : [A] "r" (matrixA), \ + [C] "r" (matrixC), \ + [bk] "r" ((int64_t)bk), \ + [N]"r" ((int64_t)(N * 4)), \ + [s] "r" (scale), \ + [K] "r" ((int64_t)stepK), \ + [offset] "r" (offsetC), \ + [flags] "r" ((int64_t)flags), \ + [u8C] "r" (u8Result), \ + [nmask] "r" (nmask), \ + [resK] "r" (resK) \ + : "%rax", "%rbx", "%rcx", \ + "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", \ + "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", \ + "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", \ + "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", \ + "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", \ + "%zmm30", "%zmm31", "memory", "cc"); + +#define mmm_m_16_asm(m, n, nRegs, mRegs, edge) \ + mmm_m_16_8_asm(m, n, nRegs, mRegs, %%zmm, 0x40, 0x80, edge) + +#define mmm_m_8_asm(m, n, nRegs, mRegs, edge) \ + mmm_m_16_8_asm(m, n, nRegs, mRegs, %%ymm, 0x20, 0x40, edge) + +#define mmm_m_n_asm(m, n, nRegs, mRegs, regs) \ + void mmm_avx512_##mRegs##x##n##_asm(U32 um, \ + U32 un, \ + U32 bk, \ + UINT8 *matrixA, \ + INT8 *matrixB, \ + I32 *matrixC, \ + UINT8 *u8Result, \ + I32 *offsetC, \ + U32 N, \ + U32 stepK, \ + const F32 *scale, \ + U32 nmask, \ + UINT8 *resK, \ + U32 flags) \ + { \ + if (nmask == 0) { \ + mmm_m_##n##_asm(m, nRegs, regs, mRegs, 0) \ + } else { \ + mmm_m_##n##_asm(m, nRegs, regs, mRegs, 1) \ + } \ + } + +mmm_m_n_asm(8, 48, 3, 8, 24) +mmm_m_n_asm(8, 48, 3, 7, 24) +mmm_m_n_asm(8, 48, 3, 6, 24) +mmm_m_n_asm(8, 48, 3, 5, 24) +mmm_m_n_asm(4, 48, 3, 4, 12) +mmm_m_n_asm(4, 48, 3, 3, 12) +mmm_m_n_asm(2, 48, 3, 2, 6) +mmm_m_n_asm(1, 48, 3, 1, 1) + +mmm_m_n_asm(12, 32, 2, 12, 24) +mmm_m_n_asm(12, 32, 2, 11, 24) +mmm_m_n_asm(12, 32, 2, 10, 24) +mmm_m_n_asm(12, 32, 2, 9, 
24) +mmm_m_n_asm(12, 32, 2, 8, 24) +mmm_m_n_asm(12, 32, 2, 7, 24) +mmm_m_n_asm(6, 32, 2, 6, 12) +mmm_m_n_asm(6, 32, 2, 5, 12) +mmm_m_n_asm(6, 32, 2, 4, 12) +mmm_m_n_asm(3, 32, 2, 3, 6) +mmm_m_n_asm(3, 32, 2, 2, 6) +mmm_m_n_asm(1, 32, 2, 1, 1) + +mmm_m_n_asm(24, 16, 1, 24, 24) +mmm_m_n_asm(24, 16, 1, 23, 24) +mmm_m_n_asm(24, 16, 1, 22, 24) +mmm_m_n_asm(24, 16, 1, 21, 24) +mmm_m_n_asm(24, 16, 1, 20, 24) +mmm_m_n_asm(24, 16, 1, 19, 24) +mmm_m_n_asm(24, 16, 1, 18, 24) +mmm_m_n_asm(24, 16, 1, 17, 24) +mmm_m_n_asm(24, 16, 1, 16, 24) +mmm_m_n_asm(24, 16, 1, 15, 24) +mmm_m_n_asm(24, 16, 1, 14, 24) +mmm_m_n_asm(24, 16, 1, 13, 24) +mmm_m_n_asm(12, 16, 1, 12, 12) +mmm_m_n_asm(12, 16, 1, 11, 12) +mmm_m_n_asm(12, 16, 1, 10, 12) +mmm_m_n_asm(12, 16, 1, 9, 12) +mmm_m_n_asm(12, 16, 1, 8, 12) +mmm_m_n_asm(12, 16, 1, 7, 12) +mmm_m_n_asm(6, 16, 1, 6, 6) +mmm_m_n_asm(6, 16, 1, 5, 6) +mmm_m_n_asm(6, 16, 1, 4, 6) +mmm_m_n_asm(6, 16, 1, 3, 6) +mmm_m_n_asm(6, 16, 1, 2, 6) +mmm_m_n_asm(1, 16, 1, 1, 1) + +mmm_m_n_asm(24, 8, 1, 24, 24) +mmm_m_n_asm(24, 8, 1, 23, 24) +mmm_m_n_asm(24, 8, 1, 22, 24) +mmm_m_n_asm(24, 8, 1, 21, 24) +mmm_m_n_asm(24, 8, 1, 20, 24) +mmm_m_n_asm(24, 8, 1, 19, 24) +mmm_m_n_asm(24, 8, 1, 18, 24) +mmm_m_n_asm(24, 8, 1, 17, 24) +mmm_m_n_asm(24, 8, 1, 16, 24) +mmm_m_n_asm(24, 8, 1, 15, 24) +mmm_m_n_asm(24, 8, 1, 14, 24) +mmm_m_n_asm(24, 8, 1, 13, 24) +mmm_m_n_asm(12, 8, 1, 12, 12) +mmm_m_n_asm(12, 8, 1, 11, 12) +mmm_m_n_asm(12, 8, 1, 10, 12) +mmm_m_n_asm(12, 8, 1, 9, 12) +mmm_m_n_asm(12, 8, 1, 8, 12) +mmm_m_n_asm(12, 8, 1, 7, 12) +mmm_m_n_asm(6, 8, 1, 6, 6) +mmm_m_n_asm(6, 8, 1, 5, 6) +mmm_m_n_asm(6, 8, 1, 4, 6) +mmm_m_n_asm(6, 8, 1, 3, 6) +mmm_m_n_asm(6, 8, 1, 2, 6) +mmm_m_n_asm(1, 8, 1, 1, 1) + + void matrix_matrix_multiply_tmp_bytes_int8( - U32 row1, U32 col1, U32 row2, U32 col2, DataType dt, U32 *bytes) + U32 row1, U32 col1, U32 row2, U32 col2, DataFormat df, DataType dt, U32 *bytes) { - row1 = align_size(row1, SIMDW); - row2 = align_size(row2, SIMDW); - col1 = align_size(col1, SIMDW); - col2 = align_size(col2, SIMDW); - *bytes = row1 * col1 + row2 * col2 + UNI_MAX(row2, col2) * 4; - *bytes *= sizeof(dt); + U32 alignedN = UNI_ALIGN(col2, 16); + U32 alignedK = UNI_ALIGN(row2, 8); + *bytes = 2 * alignedN * bytesOf(DT_I32) + alignedN * alignedK; + if (df == DF_NORMAL) { + *bytes += 32 * col1; + if (col1 % 8 != 0) { + *bytes += UNI_ALIGN(row1, 24) * 8; + } + } else if (df == DF_TRANSPOSE) { + *bytes += UNI_ALIGN(col1, 24) * UNI_MIN(BOLCK_K_DIM, alignedK); + } else { + CHECK_STATUS(NOT_SUPPORTED); + } *bytes += 64; } -EE matrix_matrix_multiply_transform_rhsN_int8( - TensorDesc desc, INT8 *src, INT8 *packB, I32 *offsetCBias) +// clang-format on + +EE matrix_matrix_multiply_transform_rhsN_int8(TensorDesc desc, INT8 *src, INT8 *packB) { DataType dt; DataFormat df; @@ -53,25 +2045,15 @@ EE matrix_matrix_multiply_transform_rhsN_int8( CHECK_STATUS(tensor2dGet(desc, &dt, &df, &K, &N)); U32 unrollSize[4] = {8, 16, 32, 48}; INT8 *tmpS = src; - bool hasBias = (offsetCBias != nullptr); - I32 *sumB = nullptr; - if (!hasBias) { - sumB = (I32 *)packB; - memset(sumB, 0, N * sizeof(I32)); - } else { - sumB = offsetCBias; - } - packB += N * bytesOf(DT_I32); + I32 *offsetCBias = (I32 *)(packB + UNI_ALIGN(K, SIMDW) * UNI_ALIGN(N, 16)); for (U32 bk = 0; bk < K; bk += blockSizeK) { blockSizeK = UNI_MIN(BOLCK_K_DIM, K - bk); - blockSizeK = UNI_MAX(blockSizeK % SIMDW, blockSizeK - blockSizeK % SIMDW); - U32 alignedBlockSizeK = align_size(blockSizeK, SIMDW); for (U32 un = 0; un < N; un += unrollSizeN) { unrollSizeN = 
UNI_MIN(UNROLL_N, N - un); - unrollSizeN = UNI_MIN(unrollSize[unrollSizeN >> 4], unrollSizeN); - matrix2_trans_l(unrollSizeN, blockSizeK, N, SIMDW, tmpS + un, packB); - packB += unrollSizeN * alignedBlockSizeK; + U32 alignedN = (unrollSizeN > 8) ? UNI_ALIGN(unrollSizeN, 16) : 8; + matrix2_trans_l(unrollSizeN, alignedN, blockSizeK, N, SIMDW, tmpS + un, packB); + packB += alignedN * UNI_ALIGN(blockSizeK, SIMDW); } tmpS += blockSizeK * N; } @@ -81,13 +2063,12 @@ EE matrix_matrix_multiply_transform_rhsN_int8( for (U32 k = 0; k < K; ++k) { tmp += (I32)(src[k * N + n]); } - sumB[n] += tmp * (-128); + offsetCBias[n] = tmp * (-128); } return SUCCESS; } -EE matrix_matrix_multiply_transform_rhsT_int8( - TensorDesc desc, INT8 *src, INT8 *packB, I32 *offsetCBias) +EE matrix_matrix_multiply_transform_rhsT_int8(TensorDesc desc, INT8 *src, INT8 *packB) { DataType dt; DataFormat df; @@ -95,25 +2076,15 @@ EE matrix_matrix_multiply_transform_rhsT_int8( CHECK_STATUS(tensor2dGet(desc, &dt, &df, &N, &K)); U32 unrollSize[4] = {8, 16, 32, 48}; INT8 *tmpS = src; - bool hasBias = (offsetCBias != nullptr); - I32 *sumB = nullptr; - if (!hasBias) { - sumB = (I32 *)packB; - memset(sumB, 0, N * sizeof(I32)); - } else { - sumB = offsetCBias; - } - packB += N * bytesOf(DT_I32); + I32 *offsetCBias = (I32 *)(packB + UNI_ALIGN(K, SIMDW) * UNI_ALIGN(N, 16)); for (U32 bk = 0; bk < K; bk += blockSizeK) { blockSizeK = UNI_MIN(BOLCK_K_DIM, K - bk); - blockSizeK = UNI_MAX(blockSizeK % SIMDW, blockSizeK - blockSizeK % SIMDW); - U32 alignedBlockSizeK = align_size(blockSizeK, SIMDW); for (U32 un = 0; un < N; un += unrollSizeN) { unrollSizeN = UNI_MIN(UNROLL_N, N - un); - unrollSizeN = UNI_MIN(unrollSize[unrollSizeN >> 4], unrollSizeN); - matrix1_trans_l(unrollSizeN, blockSizeK, K, SIMDW, tmpS + un * K, packB); - packB += unrollSizeN * alignedBlockSizeK; + U32 alignedN = (unrollSizeN > 8) ? 
UNI_ALIGN(unrollSizeN, 16) : 8; + matrix1_trans_l(unrollSizeN, alignedN, blockSizeK, K, SIMDW, tmpS + un * K, packB); + packB += alignedN * UNI_ALIGN(blockSizeK, SIMDW); } tmpS += blockSizeK; } @@ -123,4707 +2094,12 @@ EE matrix_matrix_multiply_transform_rhsT_int8( for (U32 k = 0; k < K; ++k) { tmp += (I32)(src[n * K + k]); } - sumB[n] += tmp * (-128); + offsetCBias[n] = tmp * (-128); } return SUCCESS; } -#ifdef _USE_AVX512_VNNI -#define mmmKernel8x48 \ - "movq %0, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm31 \n\t" \ - "prefetcht0 0xC0(%1) \n\t" \ - "prefetcht0 0x100(%1) \n\t" \ - "prefetcht0 0x140(%1) \n\t" \ - "vpdpbusd %%zmm24, %%zmm30, %%zmm0 \n\t" \ - "vpdpbusd %%zmm25, %%zmm30, %%zmm1 \n\t" \ - "vpdpbusd %%zmm26, %%zmm30, %%zmm2 \n\t" \ - "vpdpbusd %%zmm24, %%zmm31, %%zmm3 \n\t" \ - "vpdpbusd %%zmm25, %%zmm31, %%zmm4 \n\t" \ - "vpdpbusd %%zmm26, %%zmm31, %%zmm5 \n\t" \ - "addq %6, %%rax \n\t" \ - "addq %6, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm31 \n\t" \ - "vmovups (%1), %%zmm27 \n\t" \ - "vpdpbusd %%zmm24, %%zmm30, %%zmm6 \n\t" \ - "vpdpbusd %%zmm25, %%zmm30, %%zmm7 \n\t" \ - "vpdpbusd %%zmm26, %%zmm30, %%zmm8 \n\t" \ - "vpdpbusd %%zmm24, %%zmm31, %%zmm9 \n\t" \ - "vpdpbusd %%zmm25, %%zmm31, %%zmm10 \n\t" \ - "vpdpbusd %%zmm26, %%zmm31, %%zmm11 \n\t" \ - "addq %6, %%rax \n\t" \ - "addq %6, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm31 \n\t" \ - "vmovups 0x40(%1), %%zmm28 \n\t" \ - "vpdpbusd %%zmm24, %%zmm30, %%zmm12 \n\t" \ - "vpdpbusd %%zmm25, %%zmm30, %%zmm13 \n\t" \ - "vpdpbusd %%zmm26, %%zmm30, %%zmm14 \n\t" \ - "vpdpbusd %%zmm24, %%zmm31, %%zmm15 \n\t" \ - "vpdpbusd %%zmm25, %%zmm31, %%zmm16 \n\t" \ - "vpdpbusd %%zmm26, %%zmm31, %%zmm17 \n\t" \ - "addq %6, %%rax \n\t" \ - "addq %6, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm31 \n\t" \ - "vmovups 0x80(%1), %%zmm29 \n\t" \ - "vpdpbusd %%zmm24, %%zmm30, %%zmm18 \n\t" \ - "vpdpbusd %%zmm25, %%zmm30, %%zmm19 \n\t" \ - "vpdpbusd %%zmm26, %%zmm30, %%zmm20 \n\t" \ - "vpdpbusd %%zmm24, %%zmm31, %%zmm21 \n\t" \ - "vpdpbusd %%zmm25, %%zmm31, %%zmm22 \n\t" \ - "vpdpbusd %%zmm26, %%zmm31, %%zmm23 \n\t" \ - "movq %0, %%rax \n\t" \ - "addq $0x4, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm31 \n\t" \ - "prefetcht0 0x180(%1) \n\t" \ - "prefetcht0 0x1C0(%1) \n\t" \ - "prefetcht0 0x200(%1) \n\t" \ - "vpdpbusd %%zmm27, %%zmm30, %%zmm0 \n\t" \ - "vpdpbusd %%zmm28, %%zmm30, %%zmm1 \n\t" \ - "vpdpbusd %%zmm29, %%zmm30, %%zmm2 \n\t" \ - "vpdpbusd %%zmm27, %%zmm31, %%zmm3 \n\t" \ - "vpdpbusd %%zmm28, %%zmm31, %%zmm4 \n\t" \ - "vpdpbusd %%zmm29, %%zmm31, %%zmm5 \n\t" \ - "addq %6, %%rax \n\t" \ - "addq %6, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm31 \n\t" \ - "vmovups 0xC0(%1), %%zmm24 \n\t" \ - "vpdpbusd %%zmm27, %%zmm30, %%zmm6 \n\t" \ - "vpdpbusd %%zmm28, %%zmm30, %%zmm7 \n\t" \ - "vpdpbusd %%zmm29, %%zmm30, %%zmm8 \n\t" \ - "vpdpbusd %%zmm27, %%zmm31, %%zmm9 \n\t" \ - "vpdpbusd %%zmm28, %%zmm31, %%zmm10 \n\t" \ - "vpdpbusd %%zmm29, %%zmm31, %%zmm11 \n\t" \ - "addq %6, %%rax \n\t" \ - "addq %6, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm31 \n\t" \ - "vmovups 0x100(%1), %%zmm25 \n\t" \ - "vpdpbusd %%zmm27, %%zmm30, %%zmm12 \n\t" \ - "vpdpbusd %%zmm28, %%zmm30, %%zmm13 \n\t" \ - "vpdpbusd %%zmm29, %%zmm30, %%zmm14 \n\t" \ - 
"vpdpbusd %%zmm27, %%zmm31, %%zmm15 \n\t" \ - "vpdpbusd %%zmm28, %%zmm31, %%zmm16 \n\t" \ - "vpdpbusd %%zmm29, %%zmm31, %%zmm17 \n\t" \ - "addq %6, %%rax \n\t" \ - "addq %6, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm31 \n\t" \ - "vmovups 0x140(%1), %%zmm26 \n\t" \ - "vpdpbusd %%zmm27, %%zmm30, %%zmm18 \n\t" \ - "vpdpbusd %%zmm28, %%zmm30, %%zmm19 \n\t" \ - "vpdpbusd %%zmm29, %%zmm30, %%zmm20 \n\t" \ - "vpdpbusd %%zmm27, %%zmm31, %%zmm21 \n\t" \ - "vpdpbusd %%zmm28, %%zmm31, %%zmm22 \n\t" \ - "vpdpbusd %%zmm29, %%zmm31, %%zmm23 \n\t" -#else -#define mmmKernel8x48 \ - "movq %0, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "prefetcht0 0xC0(%1) \n\t" \ - "prefetcht0 0x100(%1) \n\t" \ - "prefetcht0 0x140(%1) \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm26, %%zmm30, %%zmm29 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm30 \n\t" \ - "vpaddd %%zmm0, %%zmm27, %%zmm0 \n\t" \ - "vpaddd %%zmm1, %%zmm28, %%zmm1 \n\t" \ - "vpaddd %%zmm2, %%zmm29, %%zmm2 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm26, %%zmm30, %%zmm29 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "addq %6, %%rax \n\t" \ - "addq %6, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "vpaddd %%zmm3, %%zmm27, %%zmm3 \n\t" \ - "vpaddd %%zmm4, %%zmm28, %%zmm4 \n\t" \ - "vpaddd %%zmm5, %%zmm29, %%zmm5 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm26, %%zmm30, %%zmm29 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm30 \n\t" \ - "vpaddd %%zmm6, %%zmm27, %%zmm6 \n\t" \ - "vpaddd %%zmm7, %%zmm28, %%zmm7 \n\t" \ - "vpaddd %%zmm8, %%zmm29, %%zmm8 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm26, %%zmm30, %%zmm29 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "addq %6, %%rax \n\t" \ - "addq %6, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "vpaddd %%zmm9, %%zmm27, %%zmm9 \n\t" \ - "vpaddd %%zmm10, %%zmm28, %%zmm10 \n\t" \ - "vpaddd %%zmm11, %%zmm29, %%zmm11 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm26, %%zmm30, %%zmm29 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm30 \n\t" \ - "vpaddd %%zmm12, %%zmm27, %%zmm12 \n\t" \ - "vpaddd %%zmm13, %%zmm28, %%zmm13 \n\t" \ - "vpaddd %%zmm14, %%zmm29, %%zmm14 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm26, %%zmm30, %%zmm29 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "addq %6, %%rax \n\t" \ - "addq %6, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "vpaddd %%zmm15, %%zmm27, 
%%zmm15 \n\t" \ - "vpaddd %%zmm16, %%zmm28, %%zmm16 \n\t" \ - "vpaddd %%zmm17, %%zmm29, %%zmm17 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm26, %%zmm30, %%zmm29 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm30 \n\t" \ - "vpaddd %%zmm18, %%zmm27, %%zmm18 \n\t" \ - "vpaddd %%zmm19, %%zmm28, %%zmm19 \n\t" \ - "vpaddd %%zmm20, %%zmm29, %%zmm20 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm26, %%zmm30, %%zmm29 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vmovups (%1), %%zmm24 \n\t" \ - "vmovups 0x40(%1), %%zmm25 \n\t" \ - "vmovups 0x80(%1), %%zmm26 \n\t" \ - "vpaddd %%zmm21, %%zmm27, %%zmm21 \n\t" \ - "vpaddd %%zmm22, %%zmm28, %%zmm22 \n\t" \ - "vpaddd %%zmm23, %%zmm29, %%zmm23 \n\t" \ - "movq %0, %%rax \n\t" \ - "addq $0x4, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "prefetcht0 0x180(%1) \n\t" \ - "prefetcht0 0x1C0(%1) \n\t" \ - "prefetcht0 0x200(%1) \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm26, %%zmm30, %%zmm29 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm30 \n\t" \ - "vpaddd %%zmm0, %%zmm27, %%zmm0 \n\t" \ - "vpaddd %%zmm1, %%zmm28, %%zmm1 \n\t" \ - "vpaddd %%zmm2, %%zmm29, %%zmm2 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm26, %%zmm30, %%zmm29 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "addq %6, %%rax \n\t" \ - "addq %6, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "vpaddd %%zmm3, %%zmm27, %%zmm3 \n\t" \ - "vpaddd %%zmm4, %%zmm28, %%zmm4 \n\t" \ - "vpaddd %%zmm5, %%zmm29, %%zmm5 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm26, %%zmm30, %%zmm29 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm30 \n\t" \ - "vpaddd %%zmm6, %%zmm27, %%zmm6 \n\t" \ - "vpaddd %%zmm7, %%zmm28, %%zmm7 \n\t" \ - "vpaddd %%zmm8, %%zmm29, %%zmm8 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm26, %%zmm30, %%zmm29 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "addq %6, %%rax \n\t" \ - "addq %6, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "vpaddd %%zmm9, %%zmm27, %%zmm9 \n\t" \ - "vpaddd %%zmm10, %%zmm28, %%zmm10 \n\t" \ - "vpaddd %%zmm11, %%zmm29, %%zmm11 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm26, %%zmm30, %%zmm29 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm30 \n\t" \ - "vpaddd %%zmm12, %%zmm27, %%zmm12 
\n\t" \ - "vpaddd %%zmm13, %%zmm28, %%zmm13 \n\t" \ - "vpaddd %%zmm14, %%zmm29, %%zmm14 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm26, %%zmm30, %%zmm29 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "addq %6, %%rax \n\t" \ - "addq %6, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "vpaddd %%zmm15, %%zmm27, %%zmm15 \n\t" \ - "vpaddd %%zmm16, %%zmm28, %%zmm16 \n\t" \ - "vpaddd %%zmm17, %%zmm29, %%zmm17 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm26, %%zmm30, %%zmm29 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm30 \n\t" \ - "vpaddd %%zmm18, %%zmm27, %%zmm18 \n\t" \ - "vpaddd %%zmm19, %%zmm28, %%zmm19 \n\t" \ - "vpaddd %%zmm20, %%zmm29, %%zmm20 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm26, %%zmm30, %%zmm29 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vmovups 0xC0(%1), %%zmm24 \n\t" \ - "vmovups 0x100(%1), %%zmm25 \n\t" \ - "vmovups 0x140(%1), %%zmm26 \n\t" \ - "vpaddd %%zmm21, %%zmm27, %%zmm21 \n\t" \ - "vpaddd %%zmm22, %%zmm28, %%zmm22 \n\t" \ - "vpaddd %%zmm23, %%zmm29, %%zmm23 \n\t" -#endif - -inline void mmm_avx512_8x48_asm(U32 um, - U32 un, - U32 bk, - UINT8 *matrixA, - INT8 *matrixB, - I32 *matrixC, - UINT8 *u8Result, - I32 *offsetC, - U32 N, - U32 stepK, - const F32 *scale, - U32 flags) -{ - __asm__ __volatile__( - "prefetcht0 0xC0(%1) \n\t" - "prefetcht0 0x100(%1) \n\t" - "prefetcht0 0x140(%1) \n\t" - "vmovups (%1), %%zmm24 \n\t" - "vmovups 0x40(%1), %%zmm25 \n\t" - "vmovups 0x80(%1), %%zmm26 \n\t" - "add $0xC0, %1 \n\t" -#ifndef _USE_AVX512_VNNI - "mov $1, %%eax \n\t" - "vmovd %%eax, %%xmm0 \n\t" - "vpbroadcastw %%xmm0, %%zmm31 \n\t" -#endif - - "movq %%rbx, %%rax \n\t" - "andq $0x1, %%rax \n\t" - "jne 0f \n\t" - "vmovups (%7), %%zmm0 \n\t" - "vmovups 0x40(%7), %%zmm1 \n\t" - "vmovups 0x80(%7), %%zmm2 \n\t" - "vmovups %%zmm0, %%zmm3 \n\t" - "vmovups %%zmm1, %%zmm4 \n\t" - "vmovups %%zmm2, %%zmm5 \n\t" - "vmovups %%zmm0, %%zmm6 \n\t" - "vmovups %%zmm1, %%zmm7 \n\t" - "vmovups %%zmm2, %%zmm8 \n\t" - "vmovups %%zmm0, %%zmm9 \n\t" - "vmovups %%zmm1, %%zmm10 \n\t" - "vmovups %%zmm2, %%zmm11 \n\t" - "vmovups %%zmm0, %%zmm12 \n\t" - "vmovups %%zmm1, %%zmm13 \n\t" - "vmovups %%zmm2, %%zmm14 \n\t" - "vmovups %%zmm0, %%zmm15 \n\t" - "vmovups %%zmm1, %%zmm16 \n\t" - "vmovups %%zmm2, %%zmm17 \n\t" - "vmovups %%zmm0, %%zmm18 \n\t" - "vmovups %%zmm1, %%zmm19 \n\t" - "vmovups %%zmm2, %%zmm20 \n\t" - "vmovups %%zmm0, %%zmm21 \n\t" - "vmovups %%zmm1, %%zmm22 \n\t" - "vmovups %%zmm2, %%zmm23 \n\t" - "jmp 1f \n\t" - - ".align 16 \n\t" - "0: \n\t" - "vxorps %%zmm0, %%zmm0, %%zmm0 \n\t" - "vxorps %%zmm1, %%zmm1, %%zmm1 \n\t" - "vxorps %%zmm2, %%zmm2, %%zmm2 \n\t" - "vxorps %%zmm3, %%zmm3, %%zmm3 \n\t" - "vxorps %%zmm4, %%zmm4, %%zmm4 \n\t" - "vxorps %%zmm5, %%zmm5, %%zmm5 \n\t" - "vxorps %%zmm6, %%zmm6, %%zmm6 \n\t" - "vxorps %%zmm7, %%zmm7, %%zmm7 \n\t" - "vxorps %%zmm8, %%zmm8, %%zmm8 \n\t" - "vxorps %%zmm9, %%zmm9, %%zmm9 \n\t" - "vxorps %%zmm10, %%zmm10, %%zmm10 \n\t" - "vxorps %%zmm11, %%zmm11, %%zmm11 \n\t" - "vxorps 
%%zmm12, %%zmm12, %%zmm12 \n\t" - "vxorps %%zmm13, %%zmm13, %%zmm13 \n\t" - "vxorps %%zmm14, %%zmm14, %%zmm14 \n\t" - "vxorps %%zmm15, %%zmm15, %%zmm15 \n\t" - "vxorps %%zmm16, %%zmm16, %%zmm16 \n\t" - "vxorps %%zmm17, %%zmm17, %%zmm17 \n\t" - "vxorps %%zmm18, %%zmm18, %%zmm18 \n\t" - "vxorps %%zmm19, %%zmm19, %%zmm19 \n\t" - "vxorps %%zmm20, %%zmm20, %%zmm20 \n\t" - "vxorps %%zmm21, %%zmm21, %%zmm21 \n\t" - "vxorps %%zmm22, %%zmm22, %%zmm22 \n\t" - "vxorps %%zmm23, %%zmm23, %%zmm23 \n\t" - - ".align 16 \n\t" - "1: \n\t" - "movq %2, %%rax \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 0x40(%%rax) \n\t" - "prefetcht0 0x80(%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "prefetcht0 0x40(%%rax, %4) \n\t" - "prefetcht0 0x80(%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 0x40(%%rax) \n\t" - "prefetcht0 0x80(%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "prefetcht0 0x40(%%rax, %4) \n\t" - "prefetcht0 0x80(%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 0x40(%%rax) \n\t" - "prefetcht0 0x80(%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "prefetcht0 0x40(%%rax, %4) \n\t" - "prefetcht0 0x80(%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 0x40(%%rax) \n\t" - "prefetcht0 0x80(%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "prefetcht0 0x40(%%rax, %4) \n\t" - "prefetcht0 0x80(%%rax, %4) \n\t" - - ".align 16 \n\t" - "2: \n\t" mmmKernel8x48 - - "add $0x180, %1 \n\t" - "add $0x8, %0 \n\t" - "dec %%rcx \n\t" - "jg 2b \n\t" - - "movq %2, %%rax \n\t" - "movq %4, %%rcx \n\t" - "addq %4, %%rcx \n\t" - "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" - "vpaddd 0x40(%%rax), %%zmm1, %%zmm1 \n\t" - "vpaddd 0x80(%%rax), %%zmm2, %%zmm2 \n\t" - "vpaddd (%%rax, %4), %%zmm3, %%zmm3 \n\t" - "vpaddd 0x40(%%rax, %4), %%zmm4, %%zmm4 \n\t" - "vpaddd 0x80(%%rax, %4), %%zmm5, %%zmm5 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%zmm6, %%zmm6 \n\t" - "vpaddd 0x40(%%rax), %%zmm7, %%zmm7 \n\t" - "vpaddd 0x80(%%rax), %%zmm8, %%zmm8 \n\t" - "vpaddd (%%rax, %4), %%zmm9, %%zmm9 \n\t" - "vpaddd 0x40(%%rax, %4), %%zmm10, %%zmm10 \n\t" - "vpaddd 0x80(%%rax, %4), %%zmm11, %%zmm11 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%zmm12, %%zmm12 \n\t" - "vpaddd 0x40(%%rax), %%zmm13, %%zmm13 \n\t" - "vpaddd 0x80(%%rax), %%zmm14, %%zmm14 \n\t" - "vpaddd (%%rax, %4), %%zmm15, %%zmm15 \n\t" - "vpaddd 0x40(%%rax, %4), %%zmm16, %%zmm16 \n\t" - "vpaddd 0x80(%%rax, %4), %%zmm17, %%zmm17 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%zmm18, %%zmm18 \n\t" - "vpaddd 0x40(%%rax), %%zmm19, %%zmm19 \n\t" - "vpaddd 0x80(%%rax), %%zmm20, %%zmm20 \n\t" - "vpaddd (%%rax, %4), %%zmm21, %%zmm21 \n\t" - "vpaddd 0x40(%%rax, %4), %%zmm22, %%zmm22 \n\t" - "vpaddd 0x80(%%rax, %4), %%zmm23, %%zmm23 \n\t" - - "cmpq $0x0, %5 \n\t" - "je 3f \n\t" - - "vbroadcastss (%5), %%zmm24 \n\t" - "vcvtdq2ps %%zmm0, %%zmm0 \n\t" - "vcvtdq2ps %%zmm1, %%zmm1 \n\t" - "vcvtdq2ps %%zmm2, %%zmm2 \n\t" - "vcvtdq2ps %%zmm3, %%zmm3 \n\t" - "vcvtdq2ps %%zmm4, %%zmm4 \n\t" - "vcvtdq2ps %%zmm5, %%zmm5 \n\t" - "vcvtdq2ps %%zmm6, %%zmm6 \n\t" - "vcvtdq2ps %%zmm7, %%zmm7 \n\t" - "vcvtdq2ps %%zmm8, %%zmm8 \n\t" - "vcvtdq2ps %%zmm9, %%zmm9 \n\t" - "vcvtdq2ps %%zmm10, %%zmm10 \n\t" - "vcvtdq2ps %%zmm11, %%zmm11 \n\t" - "vcvtdq2ps %%zmm12, %%zmm12 \n\t" - "vcvtdq2ps %%zmm13, %%zmm13 \n\t" - "vcvtdq2ps %%zmm14, %%zmm14 \n\t" - "vcvtdq2ps %%zmm15, %%zmm15 \n\t" - "vcvtdq2ps %%zmm16, %%zmm16 \n\t" - "vcvtdq2ps %%zmm17, %%zmm17 \n\t" - "vcvtdq2ps %%zmm18, %%zmm18 \n\t" - 
"vcvtdq2ps %%zmm19, %%zmm19 \n\t" - "vcvtdq2ps %%zmm20, %%zmm20 \n\t" - "vcvtdq2ps %%zmm21, %%zmm21 \n\t" - "vcvtdq2ps %%zmm22, %%zmm22 \n\t" - "vcvtdq2ps %%zmm23, %%zmm23 \n\t" - "vmulps %%zmm0, %%zmm24, %%zmm0 \n\t" - "vmulps %%zmm1, %%zmm24, %%zmm1 \n\t" - "vmulps %%zmm2, %%zmm24, %%zmm2 \n\t" - "vmulps %%zmm3, %%zmm24, %%zmm3 \n\t" - "vmulps %%zmm4, %%zmm24, %%zmm4 \n\t" - "vmulps %%zmm5, %%zmm24, %%zmm5 \n\t" - "vmulps %%zmm6, %%zmm24, %%zmm6 \n\t" - "vmulps %%zmm7, %%zmm24, %%zmm7 \n\t" - "vmulps %%zmm8, %%zmm24, %%zmm8 \n\t" - "vmulps %%zmm9, %%zmm24, %%zmm9 \n\t" - "vmulps %%zmm10, %%zmm24, %%zmm10 \n\t" - "vmulps %%zmm11, %%zmm24, %%zmm11 \n\t" - "vmulps %%zmm12, %%zmm24, %%zmm12 \n\t" - "vmulps %%zmm13, %%zmm24, %%zmm13 \n\t" - "vmulps %%zmm14, %%zmm24, %%zmm14 \n\t" - "vmulps %%zmm15, %%zmm24, %%zmm15 \n\t" - "vmulps %%zmm16, %%zmm24, %%zmm16 \n\t" - "vmulps %%zmm17, %%zmm24, %%zmm17 \n\t" - "vmulps %%zmm18, %%zmm24, %%zmm18 \n\t" - "vmulps %%zmm19, %%zmm24, %%zmm19 \n\t" - "vmulps %%zmm20, %%zmm24, %%zmm20 \n\t" - "vmulps %%zmm21, %%zmm24, %%zmm21 \n\t" - "vmulps %%zmm22, %%zmm24, %%zmm22 \n\t" - "vmulps %%zmm23, %%zmm24, %%zmm23 \n\t" - - "movq %%rbx, %%rax \n\t" - "andq $0x2, %%rax \n\t" - "je 3f \n\t" - "vcvtps2dq %%zmm0, %%zmm0 \n\t" - "vcvtps2dq %%zmm1, %%zmm1 \n\t" - "vcvtps2dq %%zmm2, %%zmm2 \n\t" - "vcvtps2dq %%zmm3, %%zmm3 \n\t" - "vcvtps2dq %%zmm4, %%zmm4 \n\t" - "vcvtps2dq %%zmm5, %%zmm5 \n\t" - "vcvtps2dq %%zmm6, %%zmm6 \n\t" - "vcvtps2dq %%zmm7, %%zmm7 \n\t" - "vcvtps2dq %%zmm8, %%zmm8 \n\t" - "vcvtps2dq %%zmm9, %%zmm9 \n\t" - "vcvtps2dq %%zmm10, %%zmm10 \n\t" - "vcvtps2dq %%zmm11, %%zmm11 \n\t" - "vcvtps2dq %%zmm12, %%zmm12 \n\t" - "vcvtps2dq %%zmm13, %%zmm13 \n\t" - "vcvtps2dq %%zmm14, %%zmm14 \n\t" - "vcvtps2dq %%zmm15, %%zmm15 \n\t" - "vcvtps2dq %%zmm16, %%zmm16 \n\t" - "vcvtps2dq %%zmm17, %%zmm17 \n\t" - "vcvtps2dq %%zmm18, %%zmm18 \n\t" - "vcvtps2dq %%zmm19, %%zmm19 \n\t" - "vcvtps2dq %%zmm20, %%zmm20 \n\t" - "vcvtps2dq %%zmm21, %%zmm21 \n\t" - "vcvtps2dq %%zmm22, %%zmm22 \n\t" - "vcvtps2dq %%zmm23, %%zmm23 \n\t" - "mov $128, %%eax \n\t" - "vmovd %%eax, %%xmm25 \n\t" - "vbroadcastss %%xmm25, %%zmm24 \n\t" - "vpaddd %%zmm0, %%zmm24, %%zmm0 \n\t" - "vpaddd %%zmm1, %%zmm24, %%zmm1 \n\t" - "vpaddd %%zmm2, %%zmm24, %%zmm2 \n\t" - "vpaddd %%zmm3, %%zmm24, %%zmm3 \n\t" - "vpaddd %%zmm4, %%zmm24, %%zmm4 \n\t" - "vpaddd %%zmm5, %%zmm24, %%zmm5 \n\t" - "vpaddd %%zmm6, %%zmm24, %%zmm6 \n\t" - "vpaddd %%zmm7, %%zmm24, %%zmm7 \n\t" - "vpaddd %%zmm8, %%zmm24, %%zmm8 \n\t" - "vpaddd %%zmm9, %%zmm24, %%zmm9 \n\t" - "vpaddd %%zmm10, %%zmm24, %%zmm10 \n\t" - "vpaddd %%zmm11, %%zmm24, %%zmm11 \n\t" - "vpaddd %%zmm12, %%zmm24, %%zmm12 \n\t" - "vpaddd %%zmm13, %%zmm24, %%zmm13 \n\t" - "vpaddd %%zmm14, %%zmm24, %%zmm14 \n\t" - "vpaddd %%zmm15, %%zmm24, %%zmm15 \n\t" - "vpaddd %%zmm16, %%zmm24, %%zmm16 \n\t" - "vpaddd %%zmm17, %%zmm24, %%zmm17 \n\t" - "vpaddd %%zmm18, %%zmm24, %%zmm18 \n\t" - "vpaddd %%zmm19, %%zmm24, %%zmm19 \n\t" - "vpaddd %%zmm20, %%zmm24, %%zmm20 \n\t" - "vpaddd %%zmm21, %%zmm24, %%zmm21 \n\t" - "vpaddd %%zmm22, %%zmm24, %%zmm22 \n\t" - "vpaddd %%zmm23, %%zmm24, %%zmm23 \n\t" - "movq %9, %%rax \n\t" - "shr $2, %4 \n\t" - "movq %4, %%rcx \n\t" - "addq %4, %%rcx \n\t" - "vpmovusdb %%zmm0, (%%rax) \n\t" - "vpmovusdb %%zmm1, 0x10(%%rax) \n\t" - "vpmovusdb %%zmm2, 0x20(%%rax) \n\t" - "vpmovusdb %%zmm3, (%%rax, %4) \n\t" - "vpmovusdb %%zmm4, 0x10(%%rax, %4) \n\t" - "vpmovusdb %%zmm5, 0x20(%%rax, %4) \n\t" - "add %%rcx, %%rax \n\t" - "vpmovusdb %%zmm6, (%%rax) \n\t" - 
"vpmovusdb %%zmm7, 0x10(%%rax) \n\t" - "vpmovusdb %%zmm8, 0x20(%%rax) \n\t" - "vpmovusdb %%zmm9, (%%rax, %4) \n\t" - "vpmovusdb %%zmm10, 0x10(%%rax, %4) \n\t" - "vpmovusdb %%zmm11, 0x20(%%rax, %4) \n\t" - "add %%rcx, %%rax \n\t" - "vpmovusdb %%zmm12, (%%rax) \n\t" - "vpmovusdb %%zmm13, 0x10(%%rax) \n\t" - "vpmovusdb %%zmm14, 0x20(%%rax) \n\t" - "vpmovusdb %%zmm15, (%%rax, %4) \n\t" - "vpmovusdb %%zmm16, 0x10(%%rax, %4) \n\t" - "vpmovusdb %%zmm17, 0x20(%%rax, %4) \n\t" - "add %%rcx, %%rax \n\t" - "vpmovusdb %%zmm18, (%%rax) \n\t" - "vpmovusdb %%zmm19, 0x10(%%rax) \n\t" - "vpmovusdb %%zmm20, 0x20(%%rax) \n\t" - "vpmovusdb %%zmm21, (%%rax, %4) \n\t" - "vpmovusdb %%zmm22, 0x10(%%rax, %4) \n\t" - "vpmovusdb %%zmm23, 0x20(%%rax, %4) \n\t" - "jmp 4f \n\t" - - ".align 16 \n\t" - "3: \n\t" - "movq %2, %%rax \n\t" - "vmovups %%zmm0, (%%rax) \n\t" - "vmovups %%zmm1, 0x40(%%rax) \n\t" - "vmovups %%zmm2, 0x80(%%rax) \n\t" - "vmovups %%zmm3, (%%rax, %4) \n\t" - "vmovups %%zmm4, 0x40(%%rax, %4) \n\t" - "vmovups %%zmm5, 0x80(%%rax, %4) \n\t" - "add %%rcx, %%rax \n\t" - "vmovups %%zmm6, (%%rax) \n\t" - "vmovups %%zmm7, 0x40(%%rax) \n\t" - "vmovups %%zmm8, 0x80(%%rax) \n\t" - "vmovups %%zmm9, (%%rax, %4) \n\t" - "vmovups %%zmm10, 0x40(%%rax, %4) \n\t" - "vmovups %%zmm11, 0x80(%%rax, %4) \n\t" - "add %%rcx, %%rax \n\t" - "vmovups %%zmm12, (%%rax) \n\t" - "vmovups %%zmm13, 0x40(%%rax) \n\t" - "vmovups %%zmm14, 0x80(%%rax) \n\t" - "vmovups %%zmm15, (%%rax, %4) \n\t" - "vmovups %%zmm16, 0x40(%%rax, %4) \n\t" - "vmovups %%zmm17, 0x80(%%rax, %4) \n\t" - "add %%rcx, %%rax \n\t" - "vmovups %%zmm18, (%%rax) \n\t" - "vmovups %%zmm19, 0x40(%%rax) \n\t" - "vmovups %%zmm20, 0x80(%%rax) \n\t" - "vmovups %%zmm21, (%%rax, %4) \n\t" - "vmovups %%zmm22, 0x40(%%rax, %4) \n\t" - "vmovups %%zmm23, 0x80(%%rax, %4) \n\t" - ".align 16 \n\t" - "4: \n\t" - : - : "r"(matrixA), "r"(matrixB), "r"(matrixC), "c"((int64_t)bk), "r"((long long)(N * 4)), - "r"(scale), "r"((int64_t)stepK), "r"(offsetC), "b"((int64_t)flags), "r"(u8Result) - : "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", - "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", - "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", "%zmm24", "%zmm25", "%zmm26", - "%zmm27", "%zmm28", "%zmm29", "%zmm30", "%zmm31", "memory", "cc"); -} - -#ifdef _USE_AVX512_VNNI -#define mmmKernel12x32 \ - "movq %0, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm29 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm30 \n\t" \ - "vpbroadcastd (%%rax, %%rbx), %%zmm31 \n\t" \ - "prefetcht0 0x80(%1) \n\t" \ - "prefetcht0 0xC0(%1) \n\t" \ - "vpdpbusd %%zmm24, %%zmm28, %%zmm0 \n\t" \ - "vpdpbusd %%zmm25, %%zmm28, %%zmm1 \n\t" \ - "vpdpbusd %%zmm24, %%zmm29, %%zmm2 \n\t" \ - "vpdpbusd %%zmm25, %%zmm29, %%zmm3 \n\t" \ - "vpdpbusd %%zmm24, %%zmm30, %%zmm4 \n\t" \ - "vpdpbusd %%zmm25, %%zmm30, %%zmm5 \n\t" \ - "vpdpbusd %%zmm24, %%zmm31, %%zmm6 \n\t" \ - "vpdpbusd %%zmm25, %%zmm31, %%zmm7 \n\t" \ - "addq %6, %%rax \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm29 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm30 \n\t" \ - "vpbroadcastd (%%rax, %%rbx), %%zmm31 \n\t" \ - "vmovups (%1), %%zmm26 \n\t" \ - "vpdpbusd %%zmm24, %%zmm28, %%zmm8 \n\t" \ - "vpdpbusd %%zmm25, %%zmm28, %%zmm9 \n\t" \ - "vpdpbusd %%zmm24, %%zmm29, %%zmm10 \n\t" \ - "vpdpbusd %%zmm25, %%zmm29, %%zmm11 \n\t" \ - "vpdpbusd %%zmm24, %%zmm30, %%zmm12 \n\t" \ - 
"vpdpbusd %%zmm25, %%zmm30, %%zmm13 \n\t" \ - "vpdpbusd %%zmm24, %%zmm31, %%zmm14 \n\t" \ - "vpdpbusd %%zmm25, %%zmm31, %%zmm15 \n\t" \ - "addq %6, %%rax \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm29 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm30 \n\t" \ - "vpbroadcastd (%%rax, %%rbx), %%zmm31 \n\t" \ - "vmovups 0x40(%1), %%zmm27 \n\t" \ - "vpdpbusd %%zmm24, %%zmm28, %%zmm16 \n\t" \ - "vpdpbusd %%zmm25, %%zmm28, %%zmm17 \n\t" \ - "vpdpbusd %%zmm24, %%zmm29, %%zmm18 \n\t" \ - "vpdpbusd %%zmm25, %%zmm29, %%zmm19 \n\t" \ - "vpdpbusd %%zmm24, %%zmm30, %%zmm20 \n\t" \ - "vpdpbusd %%zmm25, %%zmm30, %%zmm21 \n\t" \ - "vpdpbusd %%zmm24, %%zmm31, %%zmm22 \n\t" \ - "vpdpbusd %%zmm25, %%zmm31, %%zmm23 \n\t" \ - "movq %0, %%rax \n\t" \ - "addq $0x4, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm29 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm30 \n\t" \ - "vpbroadcastd (%%rax, %%rbx), %%zmm31 \n\t" \ - "prefetcht0 0x100(%1) \n\t" \ - "prefetcht0 0x140(%1) \n\t" \ - "vpdpbusd %%zmm26, %%zmm28, %%zmm0 \n\t" \ - "vpdpbusd %%zmm27, %%zmm28, %%zmm1 \n\t" \ - "vpdpbusd %%zmm26, %%zmm29, %%zmm2 \n\t" \ - "vpdpbusd %%zmm27, %%zmm29, %%zmm3 \n\t" \ - "vpdpbusd %%zmm26, %%zmm30, %%zmm4 \n\t" \ - "vpdpbusd %%zmm27, %%zmm30, %%zmm5 \n\t" \ - "vpdpbusd %%zmm26, %%zmm31, %%zmm6 \n\t" \ - "vpdpbusd %%zmm27, %%zmm31, %%zmm7 \n\t" \ - "addq %6, %%rax \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm29 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm30 \n\t" \ - "vpbroadcastd (%%rax, %%rbx), %%zmm31 \n\t" \ - "vmovups 0x80(%1), %%zmm24 \n\t" \ - "vpdpbusd %%zmm26, %%zmm28, %%zmm8 \n\t" \ - "vpdpbusd %%zmm27, %%zmm28, %%zmm9 \n\t" \ - "vpdpbusd %%zmm26, %%zmm29, %%zmm10 \n\t" \ - "vpdpbusd %%zmm27, %%zmm29, %%zmm11 \n\t" \ - "vpdpbusd %%zmm26, %%zmm30, %%zmm12 \n\t" \ - "vpdpbusd %%zmm27, %%zmm30, %%zmm13 \n\t" \ - "vpdpbusd %%zmm26, %%zmm31, %%zmm14 \n\t" \ - "vpdpbusd %%zmm27, %%zmm31, %%zmm15 \n\t" \ - "addq %6, %%rax \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm29 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm30 \n\t" \ - "vpbroadcastd (%%rax, %%rbx), %%zmm31 \n\t" \ - "vmovups 0xC0(%1), %%zmm25 \n\t" \ - "vpdpbusd %%zmm26, %%zmm28, %%zmm16 \n\t" \ - "vpdpbusd %%zmm27, %%zmm28, %%zmm17 \n\t" \ - "vpdpbusd %%zmm26, %%zmm29, %%zmm18 \n\t" \ - "vpdpbusd %%zmm27, %%zmm29, %%zmm19 \n\t" \ - "vpdpbusd %%zmm26, %%zmm30, %%zmm20 \n\t" \ - "vpdpbusd %%zmm27, %%zmm30, %%zmm21 \n\t" \ - "vpdpbusd %%zmm26, %%zmm31, %%zmm22 \n\t" \ - "vpdpbusd %%zmm27, %%zmm31, %%zmm23 \n\t" -#else -#define mmmKernel12x32 \ - "movq %0, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm29 \n\t" \ - "prefetcht0 0x80(%1) \n\t" \ - "prefetcht0 0xC0(%1) \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm26 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm29, %%zmm28 \n\t" \ - "vpmaddwd %%zmm26, %%zmm31, %%zmm26 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpaddd %%zmm0, %%zmm26, %%zmm0 \n\t" \ - "vpaddd %%zmm1, %%zmm27, %%zmm1 \n\t" \ - "vpaddd %%zmm2, %%zmm28, %%zmm2 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm30 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm29, %%zmm26 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, 
%%rbx), %%zmm29 \n\t" \ - "vpmaddwd %%zmm26, %%zmm31, %%zmm26 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "addq %6, %%rax \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "vpaddd %%zmm3, %%zmm26, %%zmm3 \n\t" \ - "vpaddd %%zmm4, %%zmm27, %%zmm4 \n\t" \ - "vpaddd %%zmm5, %%zmm28, %%zmm5 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm29, %%zmm26 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm29, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm28 \n\t" \ - "vpmaddwd %%zmm26, %%zmm31, %%zmm26 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm29 \n\t" \ - "vpaddd %%zmm6, %%zmm26, %%zmm6 \n\t" \ - "vpaddd %%zmm7, %%zmm27, %%zmm7 \n\t" \ - "vpaddd %%zmm8, %%zmm28, %%zmm8 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm26 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm29, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm29, %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm30 \n\t" \ - "vpmaddwd %%zmm26, %%zmm31, %%zmm26 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %%rbx), %%zmm29 \n\t" \ - "vpaddd %%zmm9, %%zmm26, %%zmm9 \n\t" \ - "vpaddd %%zmm10, %%zmm27, %%zmm10 \n\t" \ - "vpaddd %%zmm11, %%zmm28, %%zmm11 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm26 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm29, %%zmm28 \n\t" \ - "vpmaddwd %%zmm26, %%zmm31, %%zmm26 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "addq %6, %%rax \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "vpaddd %%zmm12, %%zmm26, %%zmm12 \n\t" \ - "vpaddd %%zmm13, %%zmm27, %%zmm13 \n\t" \ - "vpaddd %%zmm14, %%zmm28, %%zmm14 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm29, %%zmm26 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm29 \n\t" \ - "vpmaddwd %%zmm26, %%zmm31, %%zmm26 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm30 \n\t" \ - "vpaddd %%zmm15, %%zmm26, %%zmm15 \n\t" \ - "vpaddd %%zmm16, %%zmm27, %%zmm16 \n\t" \ - "vpaddd %%zmm17, %%zmm28, %%zmm17 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm29, %%zmm26 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm29, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm28 \n\t" \ - "vpmaddwd %%zmm26, %%zmm31, %%zmm26 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %%rbx), %%zmm29 \n\t" \ - "vpaddd %%zmm18, %%zmm26, %%zmm18 \n\t" \ - "vpaddd %%zmm19, %%zmm27, %%zmm19 \n\t" \ - "vpaddd %%zmm20, %%zmm28, %%zmm20 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm26 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm29, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm29, %%zmm28 \n\t" \ - "movq %0, %%rax \n\t" \ - "addq $0x4, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "vpmaddwd %%zmm26, %%zmm31, %%zmm26 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vmovups (%1), %%zmm24 \n\t" \ - "vmovups 0x40(%1), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm29 \n\t" \ - "vpaddd %%zmm21, %%zmm26, %%zmm21 \n\t" \ - "vpaddd %%zmm22, %%zmm27, %%zmm22 \n\t" \ - "vpaddd %%zmm23, %%zmm28, %%zmm23 \n\t" \ - "prefetcht0 0x100(%1) \n\t" \ - "prefetcht0 0x140(%1) \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm26 
\n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm29, %%zmm28 \n\t" \ - "vpmaddwd %%zmm26, %%zmm31, %%zmm26 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpaddd %%zmm0, %%zmm26, %%zmm0 \n\t" \ - "vpaddd %%zmm1, %%zmm27, %%zmm1 \n\t" \ - "vpaddd %%zmm2, %%zmm28, %%zmm2 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm30 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm29, %%zmm26 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %%rbx), %%zmm29 \n\t" \ - "vpmaddwd %%zmm26, %%zmm31, %%zmm26 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "addq %6, %%rax \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "vpaddd %%zmm3, %%zmm26, %%zmm3 \n\t" \ - "vpaddd %%zmm4, %%zmm27, %%zmm4 \n\t" \ - "vpaddd %%zmm5, %%zmm28, %%zmm5 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm29, %%zmm26 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm29, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm28 \n\t" \ - "vpmaddwd %%zmm26, %%zmm31, %%zmm26 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm29 \n\t" \ - "vpaddd %%zmm6, %%zmm26, %%zmm6 \n\t" \ - "vpaddd %%zmm7, %%zmm27, %%zmm7 \n\t" \ - "vpaddd %%zmm8, %%zmm28, %%zmm8 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm26 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm29, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm29, %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm30 \n\t" \ - "vpmaddwd %%zmm26, %%zmm31, %%zmm26 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %%rbx), %%zmm29 \n\t" \ - "vpaddd %%zmm9, %%zmm26, %%zmm9 \n\t" \ - "vpaddd %%zmm10, %%zmm27, %%zmm10 \n\t" \ - "vpaddd %%zmm11, %%zmm28, %%zmm11 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm26 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm29, %%zmm28 \n\t" \ - "vpmaddwd %%zmm26, %%zmm31, %%zmm26 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "addq %6, %%rax \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "vpaddd %%zmm12, %%zmm26, %%zmm12 \n\t" \ - "vpaddd %%zmm13, %%zmm27, %%zmm13 \n\t" \ - "vpaddd %%zmm14, %%zmm28, %%zmm14 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm29, %%zmm26 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm29 \n\t" \ - "vpmaddwd %%zmm26, %%zmm31, %%zmm26 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm30 \n\t" \ - "vpaddd %%zmm15, %%zmm26, %%zmm15 \n\t" \ - "vpaddd %%zmm16, %%zmm27, %%zmm16 \n\t" \ - "vpaddd %%zmm17, %%zmm28, %%zmm17 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm29, %%zmm26 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm29, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm28 \n\t" \ - "vpmaddwd %%zmm26, %%zmm31, %%zmm26 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %%rbx), %%zmm29 \n\t" \ - "vpaddd %%zmm18, %%zmm26, %%zmm18 \n\t" \ - "vpaddd %%zmm19, %%zmm27, %%zmm19 \n\t" \ - "vpaddd %%zmm20, %%zmm28, %%zmm20 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm26 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm29, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm29, %%zmm28 \n\t" \ - 
"vpmaddwd %%zmm26, %%zmm31, %%zmm26 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vmovups 0x80(%1), %%zmm24 \n\t" \ - "vmovups 0xC0(%1), %%zmm25 \n\t" \ - "vpaddd %%zmm21, %%zmm26, %%zmm21 \n\t" \ - "vpaddd %%zmm22, %%zmm27, %%zmm22 \n\t" \ - "vpaddd %%zmm23, %%zmm28, %%zmm23 \n\t" -#endif - -inline void mmm_avx512_12x32_asm(U32 um, - U32 un, - U32 bk, - UINT8 *matrixA, - INT8 *matrixB, - I32 *matrixC, - UINT8 *u8Result, - I32 *offsetC, - U32 N, - U32 stepK, - const F32 *scale, - U32 flags) -{ - __asm__ __volatile__( - "prefetcht0 0x80(%1) \n\t" - "prefetcht0 0xC0(%1) \n\t" - "vmovups (%1), %%zmm24 \n\t" - "vmovups 0x40(%1), %%zmm25 \n\t" - "add $0x80, %1 \n\t" -#ifndef _USE_AVX512_VNNI - "mov $1, %%ebx \n\t" - "vmovd %%ebx, %%xmm0 \n\t" - "vpbroadcastw %%xmm0, %%zmm31 \n\t" -#endif - - "movq %8, %%rbx \n\t" - "andq $0x1, %%rbx \n\t" - "jne 0f \n\t" - "vmovups (%7), %%zmm0 \n\t" - "vmovups 0x40(%7), %%zmm1 \n\t" - "vmovups %%zmm0, %%zmm2 \n\t" - "vmovups %%zmm1, %%zmm3 \n\t" - "vmovups %%zmm0, %%zmm4 \n\t" - "vmovups %%zmm1, %%zmm5 \n\t" - "vmovups %%zmm0, %%zmm6 \n\t" - "vmovups %%zmm1, %%zmm7 \n\t" - "vmovups %%zmm0, %%zmm8 \n\t" - "vmovups %%zmm1, %%zmm9 \n\t" - "vmovups %%zmm0, %%zmm10 \n\t" - "vmovups %%zmm1, %%zmm11 \n\t" - "vmovups %%zmm0, %%zmm12 \n\t" - "vmovups %%zmm1, %%zmm13 \n\t" - "vmovups %%zmm0, %%zmm14 \n\t" - "vmovups %%zmm1, %%zmm15 \n\t" - "vmovups %%zmm0, %%zmm16 \n\t" - "vmovups %%zmm1, %%zmm17 \n\t" - "vmovups %%zmm0, %%zmm18 \n\t" - "vmovups %%zmm1, %%zmm19 \n\t" - "vmovups %%zmm0, %%zmm20 \n\t" - "vmovups %%zmm1, %%zmm21 \n\t" - "vmovups %%zmm0, %%zmm22 \n\t" - "vmovups %%zmm1, %%zmm23 \n\t" - "jmp 1f \n\t" - ".align 16 \n\t" - "0: \n\t" - "vxorps %%zmm0, %%zmm0, %%zmm0 \n\t" - "vxorps %%zmm1, %%zmm1, %%zmm1 \n\t" - "vxorps %%zmm2, %%zmm2, %%zmm2 \n\t" - "vxorps %%zmm3, %%zmm3, %%zmm3 \n\t" - "vxorps %%zmm4, %%zmm4, %%zmm4 \n\t" - "vxorps %%zmm5, %%zmm5, %%zmm5 \n\t" - "vxorps %%zmm6, %%zmm6, %%zmm6 \n\t" - "vxorps %%zmm7, %%zmm7, %%zmm7 \n\t" - "vxorps %%zmm8, %%zmm8, %%zmm8 \n\t" - "vxorps %%zmm9, %%zmm9, %%zmm9 \n\t" - "vxorps %%zmm10, %%zmm10, %%zmm10 \n\t" - "vxorps %%zmm11, %%zmm11, %%zmm11 \n\t" - "vxorps %%zmm12, %%zmm12, %%zmm12 \n\t" - "vxorps %%zmm13, %%zmm13, %%zmm13 \n\t" - "vxorps %%zmm14, %%zmm14, %%zmm14 \n\t" - "vxorps %%zmm15, %%zmm15, %%zmm15 \n\t" - "vxorps %%zmm16, %%zmm16, %%zmm16 \n\t" - "vxorps %%zmm17, %%zmm17, %%zmm17 \n\t" - "vxorps %%zmm18, %%zmm18, %%zmm18 \n\t" - "vxorps %%zmm19, %%zmm19, %%zmm19 \n\t" - "vxorps %%zmm20, %%zmm20, %%zmm20 \n\t" - "vxorps %%zmm21, %%zmm21, %%zmm21 \n\t" - "vxorps %%zmm22, %%zmm22, %%zmm22 \n\t" - "vxorps %%zmm23, %%zmm23, %%zmm23 \n\t" - ".align 16 \n\t" - "1: \n\t" - "movq %2, %%rax \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 0x40(%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "prefetcht0 0x40(%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 0x40(%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "prefetcht0 0x40(%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 0x40(%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "prefetcht0 0x40(%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 0x40(%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "prefetcht0 0x40(%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 0x40(%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "prefetcht0 0x40(%%rax, %4) \n\t" - "add %4, %%rax 
\n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 0x40(%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "prefetcht0 0x40(%%rax, %4) \n\t" - "movq %6, %%rbx \n\t" - "addq %6, %%rbx \n\t" - "addq %6, %%rbx \n\t" - - ".align 16 \n\t" - "2: \n\t" mmmKernel12x32 - - "add $0x100, %1 \n\t" - "add $0x8, %0 \n\t" - "dec %%rcx \n\t" - "jg 2b \n\t" - - "movq %2, %%rax \n\t" - "movq %4, %%rcx \n\t" - "addq %4, %%rcx \n\t" - "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" - "vpaddd 0x40(%%rax), %%zmm1, %%zmm1 \n\t" - "vpaddd (%%rax, %4), %%zmm2, %%zmm2 \n\t" - "vpaddd 0x40(%%rax, %4), %%zmm3, %%zmm3 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%zmm4, %%zmm4 \n\t" - "vpaddd 0x40(%%rax), %%zmm5, %%zmm5 \n\t" - "vpaddd (%%rax, %4), %%zmm6, %%zmm6 \n\t" - "vpaddd 0x40(%%rax, %4), %%zmm7, %%zmm7 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%zmm8, %%zmm8 \n\t" - "vpaddd 0x40(%%rax), %%zmm9, %%zmm9 \n\t" - "vpaddd (%%rax, %4), %%zmm10, %%zmm10 \n\t" - "vpaddd 0x40(%%rax, %4), %%zmm11, %%zmm11 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%zmm12, %%zmm12 \n\t" - "vpaddd 0x40(%%rax), %%zmm13, %%zmm13 \n\t" - "vpaddd (%%rax, %4), %%zmm14, %%zmm14 \n\t" - "vpaddd 0x40(%%rax, %4), %%zmm15, %%zmm15 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%zmm16, %%zmm16 \n\t" - "vpaddd 0x40(%%rax), %%zmm17, %%zmm17 \n\t" - "vpaddd (%%rax, %4), %%zmm18, %%zmm18 \n\t" - "vpaddd 0x40(%%rax, %4), %%zmm19, %%zmm19 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%zmm20, %%zmm20 \n\t" - "vpaddd 0x40(%%rax), %%zmm21, %%zmm21 \n\t" - "vpaddd (%%rax, %4), %%zmm22, %%zmm22 \n\t" - "vpaddd 0x40(%%rax, %4), %%zmm23, %%zmm23 \n\t" - - "cmpq $0x0, %5 \n\t" - "je 3f \n\t" - - "vbroadcastss (%5), %%zmm24 \n\t" - "vcvtdq2ps %%zmm0, %%zmm0 \n\t" - "vcvtdq2ps %%zmm1, %%zmm1 \n\t" - "vcvtdq2ps %%zmm2, %%zmm2 \n\t" - "vcvtdq2ps %%zmm3, %%zmm3 \n\t" - "vcvtdq2ps %%zmm4, %%zmm4 \n\t" - "vcvtdq2ps %%zmm5, %%zmm5 \n\t" - "vcvtdq2ps %%zmm6, %%zmm6 \n\t" - "vcvtdq2ps %%zmm7, %%zmm7 \n\t" - "vcvtdq2ps %%zmm8, %%zmm8 \n\t" - "vcvtdq2ps %%zmm9, %%zmm9 \n\t" - "vcvtdq2ps %%zmm10, %%zmm10 \n\t" - "vcvtdq2ps %%zmm11, %%zmm11 \n\t" - "vcvtdq2ps %%zmm12, %%zmm12 \n\t" - "vcvtdq2ps %%zmm13, %%zmm13 \n\t" - "vcvtdq2ps %%zmm14, %%zmm14 \n\t" - "vcvtdq2ps %%zmm15, %%zmm15 \n\t" - "vcvtdq2ps %%zmm16, %%zmm16 \n\t" - "vcvtdq2ps %%zmm17, %%zmm17 \n\t" - "vcvtdq2ps %%zmm18, %%zmm18 \n\t" - "vcvtdq2ps %%zmm19, %%zmm19 \n\t" - "vcvtdq2ps %%zmm20, %%zmm20 \n\t" - "vcvtdq2ps %%zmm21, %%zmm21 \n\t" - "vcvtdq2ps %%zmm22, %%zmm22 \n\t" - "vcvtdq2ps %%zmm23, %%zmm23 \n\t" - "vmulps %%zmm0, %%zmm24, %%zmm0 \n\t" - "vmulps %%zmm1, %%zmm24, %%zmm1 \n\t" - "vmulps %%zmm2, %%zmm24, %%zmm2 \n\t" - "vmulps %%zmm3, %%zmm24, %%zmm3 \n\t" - "vmulps %%zmm4, %%zmm24, %%zmm4 \n\t" - "vmulps %%zmm5, %%zmm24, %%zmm5 \n\t" - "vmulps %%zmm6, %%zmm24, %%zmm6 \n\t" - "vmulps %%zmm7, %%zmm24, %%zmm7 \n\t" - "vmulps %%zmm8, %%zmm24, %%zmm8 \n\t" - "vmulps %%zmm9, %%zmm24, %%zmm9 \n\t" - "vmulps %%zmm10, %%zmm24, %%zmm10 \n\t" - "vmulps %%zmm11, %%zmm24, %%zmm11 \n\t" - "vmulps %%zmm12, %%zmm24, %%zmm12 \n\t" - "vmulps %%zmm13, %%zmm24, %%zmm13 \n\t" - "vmulps %%zmm14, %%zmm24, %%zmm14 \n\t" - "vmulps %%zmm15, %%zmm24, %%zmm15 \n\t" - "vmulps %%zmm16, %%zmm24, %%zmm16 \n\t" - "vmulps %%zmm17, %%zmm24, %%zmm17 \n\t" - "vmulps %%zmm18, %%zmm24, %%zmm18 \n\t" - "vmulps %%zmm19, %%zmm24, %%zmm19 \n\t" - "vmulps %%zmm20, %%zmm24, %%zmm20 \n\t" - "vmulps %%zmm21, %%zmm24, %%zmm21 \n\t" - "vmulps %%zmm22, %%zmm24, %%zmm22 \n\t" - "vmulps %%zmm23, %%zmm24, %%zmm23 \n\t" - - "movq %8, 
%%rbx \n\t" - "andq $0x2, %%rbx \n\t" - "je 3f \n\t" - "vcvtps2dq %%zmm0, %%zmm0 \n\t" - "vcvtps2dq %%zmm1, %%zmm1 \n\t" - "vcvtps2dq %%zmm2, %%zmm2 \n\t" - "vcvtps2dq %%zmm3, %%zmm3 \n\t" - "vcvtps2dq %%zmm4, %%zmm4 \n\t" - "vcvtps2dq %%zmm5, %%zmm5 \n\t" - "vcvtps2dq %%zmm6, %%zmm6 \n\t" - "vcvtps2dq %%zmm7, %%zmm7 \n\t" - "vcvtps2dq %%zmm8, %%zmm8 \n\t" - "vcvtps2dq %%zmm9, %%zmm9 \n\t" - "vcvtps2dq %%zmm10, %%zmm10 \n\t" - "vcvtps2dq %%zmm11, %%zmm11 \n\t" - "vcvtps2dq %%zmm12, %%zmm12 \n\t" - "vcvtps2dq %%zmm13, %%zmm13 \n\t" - "vcvtps2dq %%zmm14, %%zmm14 \n\t" - "vcvtps2dq %%zmm15, %%zmm15 \n\t" - "vcvtps2dq %%zmm16, %%zmm16 \n\t" - "vcvtps2dq %%zmm17, %%zmm17 \n\t" - "vcvtps2dq %%zmm18, %%zmm18 \n\t" - "vcvtps2dq %%zmm19, %%zmm19 \n\t" - "vcvtps2dq %%zmm20, %%zmm20 \n\t" - "vcvtps2dq %%zmm21, %%zmm21 \n\t" - "vcvtps2dq %%zmm22, %%zmm22 \n\t" - "vcvtps2dq %%zmm23, %%zmm23 \n\t" - "mov $128, %%eax \n\t" - "vmovd %%eax, %%xmm25 \n\t" - "vbroadcastss %%xmm25, %%zmm24 \n\t" - "vpaddd %%zmm0, %%zmm24, %%zmm0 \n\t" - "vpaddd %%zmm1, %%zmm24, %%zmm1 \n\t" - "vpaddd %%zmm2, %%zmm24, %%zmm2 \n\t" - "vpaddd %%zmm3, %%zmm24, %%zmm3 \n\t" - "vpaddd %%zmm4, %%zmm24, %%zmm4 \n\t" - "vpaddd %%zmm5, %%zmm24, %%zmm5 \n\t" - "vpaddd %%zmm6, %%zmm24, %%zmm6 \n\t" - "vpaddd %%zmm7, %%zmm24, %%zmm7 \n\t" - "vpaddd %%zmm8, %%zmm24, %%zmm8 \n\t" - "vpaddd %%zmm9, %%zmm24, %%zmm9 \n\t" - "vpaddd %%zmm10, %%zmm24, %%zmm10 \n\t" - "vpaddd %%zmm11, %%zmm24, %%zmm11 \n\t" - "vpaddd %%zmm12, %%zmm24, %%zmm12 \n\t" - "vpaddd %%zmm13, %%zmm24, %%zmm13 \n\t" - "vpaddd %%zmm14, %%zmm24, %%zmm14 \n\t" - "vpaddd %%zmm15, %%zmm24, %%zmm15 \n\t" - "vpaddd %%zmm16, %%zmm24, %%zmm16 \n\t" - "vpaddd %%zmm17, %%zmm24, %%zmm17 \n\t" - "vpaddd %%zmm18, %%zmm24, %%zmm18 \n\t" - "vpaddd %%zmm19, %%zmm24, %%zmm19 \n\t" - "vpaddd %%zmm20, %%zmm24, %%zmm20 \n\t" - "vpaddd %%zmm21, %%zmm24, %%zmm21 \n\t" - "vpaddd %%zmm22, %%zmm24, %%zmm22 \n\t" - "vpaddd %%zmm23, %%zmm24, %%zmm23 \n\t" - "movq %9, %%rax \n\t" - "shr $2, %4 \n\t" - "movq %4, %%rcx \n\t" - "addq %4, %%rcx \n\t" - "vpmovusdb %%zmm0, (%%rax) \n\t" - "vpmovusdb %%zmm1, 0x10(%%rax) \n\t" - "vpmovusdb %%zmm2, (%%rax, %4) \n\t" - "vpmovusdb %%zmm3, 0x10(%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm4, (%%rax) \n\t" - "vpmovusdb %%zmm5, 0x10(%%rax) \n\t" - "vpmovusdb %%zmm6, (%%rax, %4) \n\t" - "vpmovusdb %%zmm7, 0x10(%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm8, (%%rax) \n\t" - "vpmovusdb %%zmm9, 0x10(%%rax) \n\t" - "vpmovusdb %%zmm10, (%%rax, %4) \n\t" - "vpmovusdb %%zmm11, 0x10(%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm12, (%%rax) \n\t" - "vpmovusdb %%zmm13, 0x10(%%rax) \n\t" - "vpmovusdb %%zmm14, (%%rax, %4) \n\t" - "vpmovusdb %%zmm15, 0x10(%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm16, (%%rax) \n\t" - "vpmovusdb %%zmm17, 0x10(%%rax) \n\t" - "vpmovusdb %%zmm18, (%%rax, %4) \n\t" - "vpmovusdb %%zmm19, 0x10(%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm20, (%%rax) \n\t" - "vpmovusdb %%zmm21, 0x10(%%rax) \n\t" - "vpmovusdb %%zmm22, (%%rax, %4) \n\t" - "vpmovusdb %%zmm23, 0x10(%%rax, %4) \n\t" - "jmp 4f \n\t" - - ".align 16 \n\t" - "3: \n\t" - "movq %2, %%rax \n\t" - "vmovups %%zmm0, (%%rax) \n\t" - "vmovups %%zmm1, 0x40(%%rax) \n\t" - "vmovups %%zmm2, (%%rax, %4) \n\t" - "vmovups %%zmm3, 0x40(%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%zmm4, (%%rax) \n\t" - "vmovups %%zmm5, 0x40(%%rax) \n\t" - "vmovups %%zmm6, (%%rax, %4) \n\t" - "vmovups %%zmm7, 0x40(%%rax, %4) \n\t" - 
"addq %%rcx, %%rax \n\t" - "vmovups %%zmm8, (%%rax) \n\t" - "vmovups %%zmm9, 0x40(%%rax) \n\t" - "vmovups %%zmm10, (%%rax, %4) \n\t" - "vmovups %%zmm11, 0x40(%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%zmm12, (%%rax) \n\t" - "vmovups %%zmm13, 0x40(%%rax) \n\t" - "vmovups %%zmm14, (%%rax, %4) \n\t" - "vmovups %%zmm15, 0x40(%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%zmm16, (%%rax) \n\t" - "vmovups %%zmm17, 0x40(%%rax) \n\t" - "vmovups %%zmm18, (%%rax, %4) \n\t" - "vmovups %%zmm19, 0x40(%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%zmm20, (%%rax) \n\t" - "vmovups %%zmm21, 0x40(%%rax) \n\t" - "vmovups %%zmm22, (%%rax, %4) \n\t" - "vmovups %%zmm23, 0x40(%%rax, %4) \n\t" - - ".align 16 \n\t" - "4: \n\t" - : - : "r"(matrixA), "r"(matrixB), "r"(matrixC), "c"((int64_t)bk), "r"((long long)(N * 4)), - "r"(scale), "r"((int64_t)stepK), "r"(offsetC), "r"((int64_t)flags), "r"(u8Result) - : "%rax", "%rbx", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", - "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", - "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", "%zmm24", "%zmm25", - "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", "%zmm31", "memory", "cc"); -} - -#ifdef _USE_AVX512_VNNI -#define mmmKernel24x16 \ - "movq %0, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm29 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm30 \n\t" \ - "prefetcht0 0x80(%1) \n\t" \ - "vpdpbusd %%zmm24, %%zmm25, %%zmm0 \n\t" \ - "vpdpbusd %%zmm24, %%zmm26, %%zmm1 \n\t" \ - "vpdpbusd %%zmm24, %%zmm27, %%zmm2 \n\t" \ - "vpdpbusd %%zmm24, %%zmm28, %%zmm3 \n\t" \ - "vpdpbusd %%zmm24, %%zmm29, %%zmm4 \n\t" \ - "vpdpbusd %%zmm24, %%zmm30, %%zmm5 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm29 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm30 \n\t" \ - "vpdpbusd %%zmm24, %%zmm25, %%zmm6 \n\t" \ - "vpdpbusd %%zmm24, %%zmm26, %%zmm7 \n\t" \ - "vpdpbusd %%zmm24, %%zmm27, %%zmm8 \n\t" \ - "vpdpbusd %%zmm24, %%zmm28, %%zmm9 \n\t" \ - "vpdpbusd %%zmm24, %%zmm29, %%zmm10 \n\t" \ - "vpdpbusd %%zmm24, %%zmm30, %%zmm11 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm29 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm30 \n\t" \ - "vmovups (%1), %%zmm31 \n\t" \ - "vpdpbusd %%zmm24, %%zmm25, %%zmm12 \n\t" \ - "vpdpbusd %%zmm24, %%zmm26, %%zmm13 \n\t" \ - "vpdpbusd %%zmm24, %%zmm27, %%zmm14 \n\t" \ - "vpdpbusd %%zmm24, %%zmm28, %%zmm15 \n\t" \ - "vpdpbusd %%zmm24, %%zmm29, %%zmm16 \n\t" \ - "vpdpbusd %%zmm24, %%zmm30, %%zmm17 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm29 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm30 \n\t" \ - "vpdpbusd %%zmm24, %%zmm25, %%zmm18 
\n\t" \ - "vpdpbusd %%zmm24, %%zmm26, %%zmm19 \n\t" \ - "vpdpbusd %%zmm24, %%zmm27, %%zmm20 \n\t" \ - "vpdpbusd %%zmm24, %%zmm28, %%zmm21 \n\t" \ - "vpdpbusd %%zmm24, %%zmm29, %%zmm22 \n\t" \ - "vpdpbusd %%zmm24, %%zmm30, %%zmm23 \n\t" \ - "movq %0, %%rax \n\t" \ - "addq $0x4, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm29 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm30 \n\t" \ - "prefetcht0 0xC0(%1) \n\t" \ - "vpdpbusd %%zmm31, %%zmm25, %%zmm0 \n\t" \ - "vpdpbusd %%zmm31, %%zmm26, %%zmm1 \n\t" \ - "vpdpbusd %%zmm31, %%zmm27, %%zmm2 \n\t" \ - "vpdpbusd %%zmm31, %%zmm28, %%zmm3 \n\t" \ - "vpdpbusd %%zmm31, %%zmm29, %%zmm4 \n\t" \ - "vpdpbusd %%zmm31, %%zmm30, %%zmm5 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm29 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm30 \n\t" \ - "vpdpbusd %%zmm31, %%zmm25, %%zmm6 \n\t" \ - "vpdpbusd %%zmm31, %%zmm26, %%zmm7 \n\t" \ - "vpdpbusd %%zmm31, %%zmm27, %%zmm8 \n\t" \ - "vpdpbusd %%zmm31, %%zmm28, %%zmm9 \n\t" \ - "vpdpbusd %%zmm31, %%zmm29, %%zmm10 \n\t" \ - "vpdpbusd %%zmm31, %%zmm30, %%zmm11 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm29 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm30 \n\t" \ - "vmovups 0x40(%1), %%zmm24 \n\t" \ - "vpdpbusd %%zmm31, %%zmm25, %%zmm12 \n\t" \ - "vpdpbusd %%zmm31, %%zmm26, %%zmm13 \n\t" \ - "vpdpbusd %%zmm31, %%zmm27, %%zmm14 \n\t" \ - "vpdpbusd %%zmm31, %%zmm28, %%zmm15 \n\t" \ - "vpdpbusd %%zmm31, %%zmm29, %%zmm16 \n\t" \ - "vpdpbusd %%zmm31, %%zmm30, %%zmm17 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm29 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm30 \n\t" \ - "vpdpbusd %%zmm31, %%zmm25, %%zmm18 \n\t" \ - "vpdpbusd %%zmm31, %%zmm26, %%zmm19 \n\t" \ - "vpdpbusd %%zmm31, %%zmm27, %%zmm20 \n\t" \ - "vpdpbusd %%zmm31, %%zmm28, %%zmm21 \n\t" \ - "vpdpbusd %%zmm31, %%zmm29, %%zmm22 \n\t" \ - "vpdpbusd %%zmm31, %%zmm30, %%zmm23 \n\t" -#else -#define mmmKernel24x16 \ - "movq %0, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "prefetcht0 0x80(%1) \n\t" \ - "vpmaddubsw %%zmm24, %%zmm25, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm26, %%zmm29 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm27, %%zmm30 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpmaddwd %%zmm30, %%zmm31, %%zmm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "vpaddd %%zmm0, %%zmm28, %%zmm0 \n\t" \ - "vpaddd %%zmm1, %%zmm29, %%zmm1 \n\t" \ - "vpaddd %%zmm2, %%zmm30, %%zmm2 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm25, 
%%zmm28 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm26, %%zmm29 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm27, %%zmm30 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpmaddwd %%zmm30, %%zmm31, %%zmm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "vpaddd %%zmm3, %%zmm28, %%zmm3 \n\t" \ - "vpaddd %%zmm4, %%zmm29, %%zmm4 \n\t" \ - "vpaddd %%zmm5, %%zmm30, %%zmm5 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm25, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm26, %%zmm29 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm27, %%zmm30 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpmaddwd %%zmm30, %%zmm31, %%zmm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "vpaddd %%zmm6, %%zmm28, %%zmm6 \n\t" \ - "vpaddd %%zmm7, %%zmm29, %%zmm7 \n\t" \ - "vpaddd %%zmm8, %%zmm30, %%zmm8 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm25, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm26, %%zmm29 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm27, %%zmm30 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpmaddwd %%zmm30, %%zmm31, %%zmm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "vpaddd %%zmm9, %%zmm28, %%zmm9 \n\t" \ - "vpaddd %%zmm10, %%zmm29, %%zmm10 \n\t" \ - "vpaddd %%zmm11, %%zmm30, %%zmm11 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm25, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm26, %%zmm29 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm27, %%zmm30 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpmaddwd %%zmm30, %%zmm31, %%zmm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "vpaddd %%zmm12, %%zmm28, %%zmm12 \n\t" \ - "vpaddd %%zmm13, %%zmm29, %%zmm13 \n\t" \ - "vpaddd %%zmm14, %%zmm30, %%zmm14 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm25, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm26, %%zmm29 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm27, %%zmm30 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpmaddwd %%zmm30, %%zmm31, %%zmm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "vpaddd %%zmm15, %%zmm28, %%zmm15 \n\t" \ - "vpaddd %%zmm16, %%zmm29, %%zmm16 \n\t" \ - "vpaddd %%zmm17, %%zmm30, %%zmm17 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm25, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm26, %%zmm29 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm27, %%zmm30 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpmaddwd %%zmm30, %%zmm31, %%zmm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "vpaddd %%zmm18, %%zmm28, %%zmm18 \n\t" \ - "vpaddd %%zmm19, %%zmm29, %%zmm19 \n\t" \ - "vpaddd %%zmm20, %%zmm30, %%zmm20 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm25, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm26, %%zmm29 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm27, %%zmm30 
\n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpmaddwd %%zmm30, %%zmm31, %%zmm30 \n\t" \ - "movq %0, %%rax \n\t" \ - "addq $0x4, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "vmovups (%1), %%zmm24 \n\t" \ - "vpaddd %%zmm21, %%zmm28, %%zmm21 \n\t" \ - "vpaddd %%zmm22, %%zmm29, %%zmm22 \n\t" \ - "vpaddd %%zmm23, %%zmm30, %%zmm23 \n\t" \ - "prefetcht0 0xC0(%1) \n\t" \ - "vpmaddubsw %%zmm24, %%zmm25, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm26, %%zmm29 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm27, %%zmm30 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpmaddwd %%zmm30, %%zmm31, %%zmm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "vpaddd %%zmm0, %%zmm28, %%zmm0 \n\t" \ - "vpaddd %%zmm1, %%zmm29, %%zmm1 \n\t" \ - "vpaddd %%zmm2, %%zmm30, %%zmm2 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm25, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm26, %%zmm29 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm27, %%zmm30 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpmaddwd %%zmm30, %%zmm31, %%zmm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "vpaddd %%zmm3, %%zmm28, %%zmm3 \n\t" \ - "vpaddd %%zmm4, %%zmm29, %%zmm4 \n\t" \ - "vpaddd %%zmm5, %%zmm30, %%zmm5 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm25, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm26, %%zmm29 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm27, %%zmm30 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpmaddwd %%zmm30, %%zmm31, %%zmm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "vpaddd %%zmm6, %%zmm28, %%zmm6 \n\t" \ - "vpaddd %%zmm7, %%zmm29, %%zmm7 \n\t" \ - "vpaddd %%zmm8, %%zmm30, %%zmm8 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm25, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm26, %%zmm29 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm27, %%zmm30 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpmaddwd %%zmm30, %%zmm31, %%zmm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "vpaddd %%zmm9, %%zmm28, %%zmm9 \n\t" \ - "vpaddd %%zmm10, %%zmm29, %%zmm10 \n\t" \ - "vpaddd %%zmm11, %%zmm30, %%zmm11 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm25, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm26, %%zmm29 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm27, %%zmm30 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpmaddwd %%zmm30, %%zmm31, %%zmm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "vpaddd %%zmm12, %%zmm28, %%zmm12 \n\t" \ - "vpaddd %%zmm13, %%zmm29, %%zmm13 \n\t" \ - "vpaddd %%zmm14, %%zmm30, %%zmm14 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm25, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm26, %%zmm29 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm27, %%zmm30 \n\t" \ - "vpmaddwd 
%%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpmaddwd %%zmm30, %%zmm31, %%zmm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "vpaddd %%zmm15, %%zmm28, %%zmm15 \n\t" \ - "vpaddd %%zmm16, %%zmm29, %%zmm16 \n\t" \ - "vpaddd %%zmm17, %%zmm30, %%zmm17 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm25, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm26, %%zmm29 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm27, %%zmm30 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpmaddwd %%zmm30, %%zmm31, %%zmm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "vpaddd %%zmm18, %%zmm28, %%zmm18 \n\t" \ - "vpaddd %%zmm19, %%zmm29, %%zmm19 \n\t" \ - "vpaddd %%zmm20, %%zmm30, %%zmm20 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm25, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm26, %%zmm29 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm27, %%zmm30 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpmaddwd %%zmm30, %%zmm31, %%zmm30 \n\t" \ - "vmovups 0x40(%1), %%zmm24 \n\t" \ - "vpaddd %%zmm21, %%zmm28, %%zmm21 \n\t" \ - "vpaddd %%zmm22, %%zmm29, %%zmm22 \n\t" \ - "vpaddd %%zmm23, %%zmm30, %%zmm23 \n\t" -#endif - -inline void mmm_avx512_24x16_asm(U32 um, - U32 un, - U32 bk, - UINT8 *matrixA, - INT8 *matrixB, - I32 *matrixC, - UINT8 *u8Result, - I32 *offsetC, - U32 N, - U32 stepK, - const F32 *scale, - U32 flags) -{ - __asm__ __volatile__( - "prefetcht0 0x80(%1) \n\t" - "vmovups (%1), %%zmm24 \n\t" - "add $0x40, %1 \n\t" -#ifndef _USE_AVX512_VNNI - "mov $1, %%ebx \n\t" - "vmovd %%ebx, %%xmm0 \n\t" - "vpbroadcastw %%xmm0, %%zmm31 \n\t" -#endif - "movq %8, %%rbx \n\t" - "andq $0x1, %%rbx \n\t" - "jne 0f \n\t" - "vmovups (%7), %%zmm0 \n\t" - "vmovups %%zmm0, %%zmm1 \n\t" - "vmovups %%zmm0, %%zmm2 \n\t" - "vmovups %%zmm0, %%zmm3 \n\t" - "vmovups %%zmm0, %%zmm4 \n\t" - "vmovups %%zmm0, %%zmm5 \n\t" - "vmovups %%zmm0, %%zmm6 \n\t" - "vmovups %%zmm0, %%zmm7 \n\t" - "vmovups %%zmm0, %%zmm8 \n\t" - "vmovups %%zmm0, %%zmm9 \n\t" - "vmovups %%zmm0, %%zmm10 \n\t" - "vmovups %%zmm0, %%zmm11 \n\t" - "vmovups %%zmm0, %%zmm12 \n\t" - "vmovups %%zmm0, %%zmm13 \n\t" - "vmovups %%zmm0, %%zmm14 \n\t" - "vmovups %%zmm0, %%zmm15 \n\t" - "vmovups %%zmm0, %%zmm16 \n\t" - "vmovups %%zmm0, %%zmm17 \n\t" - "vmovups %%zmm0, %%zmm18 \n\t" - "vmovups %%zmm0, %%zmm19 \n\t" - "vmovups %%zmm0, %%zmm20 \n\t" - "vmovups %%zmm0, %%zmm21 \n\t" - "vmovups %%zmm0, %%zmm22 \n\t" - "vmovups %%zmm0, %%zmm23 \n\t" - "jmp 1f \n\t" - ".align 16 \n\t" - "0: \n\t" - "vxorps %%zmm0, %%zmm0, %%zmm0 \n\t" - "vxorps %%zmm1, %%zmm1, %%zmm1 \n\t" - "vxorps %%zmm2, %%zmm2, %%zmm2 \n\t" - "vxorps %%zmm3, %%zmm3, %%zmm3 \n\t" - "vxorps %%zmm4, %%zmm4, %%zmm4 \n\t" - "vxorps %%zmm5, %%zmm5, %%zmm5 \n\t" - "vxorps %%zmm6, %%zmm6, %%zmm6 \n\t" - "vxorps %%zmm7, %%zmm7, %%zmm7 \n\t" - "vxorps %%zmm8, %%zmm8, %%zmm8 \n\t" - "vxorps %%zmm9, %%zmm9, %%zmm9 \n\t" - "vxorps %%zmm10, %%zmm10, %%zmm10 \n\t" - "vxorps %%zmm11, %%zmm11, %%zmm11 \n\t" - "vxorps %%zmm12, %%zmm12, %%zmm12 \n\t" - "vxorps %%zmm13, %%zmm13, %%zmm13 \n\t" - "vxorps %%zmm14, %%zmm14, %%zmm14 \n\t" - "vxorps %%zmm15, %%zmm15, %%zmm15 \n\t" - "vxorps %%zmm16, %%zmm16, %%zmm16 \n\t" - "vxorps %%zmm17, %%zmm17, %%zmm17 \n\t" - "vxorps %%zmm18, %%zmm18, %%zmm18 \n\t" 
- "vxorps %%zmm19, %%zmm19, %%zmm19 \n\t" - "vxorps %%zmm20, %%zmm20, %%zmm20 \n\t" - "vxorps %%zmm21, %%zmm21, %%zmm21 \n\t" - "vxorps %%zmm22, %%zmm22, %%zmm22 \n\t" - "vxorps %%zmm23, %%zmm23, %%zmm23 \n\t" - ".align 16 \n\t" - "1: \n\t" - "movq %2, %%rax \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "movq %6, %%rbx \n\t" - "addq %6, %%rbx \n\t" - "addq %6, %%rbx \n\t" - - ".align 16 \n\t" - "2: \n\t" mmmKernel24x16 - - "add $0x80, %1 \n\t" - "add $0x8, %0 \n\t" - "dec %%rcx \n\t" - "jg 2b \n\t" - - "movq %2, %%rax \n\t" - "movq %4, %%rcx \n\t" - "addq %4, %%rcx \n\t" - "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" - "vpaddd (%%rax, %4), %%zmm1, %%zmm1 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%zmm2, %%zmm2 \n\t" - "vpaddd (%%rax, %4), %%zmm3, %%zmm3 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%zmm4, %%zmm4 \n\t" - "vpaddd (%%rax, %4), %%zmm5, %%zmm5 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%zmm6, %%zmm6 \n\t" - "vpaddd (%%rax, %4), %%zmm7, %%zmm7 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%zmm8, %%zmm8 \n\t" - "vpaddd (%%rax, %4), %%zmm9, %%zmm9 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%zmm10, %%zmm10 \n\t" - "vpaddd (%%rax, %4), %%zmm11, %%zmm11 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%zmm12, %%zmm12 \n\t" - "vpaddd (%%rax, %4), %%zmm13, %%zmm13 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%zmm14, %%zmm14 \n\t" - "vpaddd (%%rax, %4), %%zmm15, %%zmm15 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%zmm16, %%zmm16 \n\t" - "vpaddd (%%rax, %4), %%zmm17, %%zmm17 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%zmm18, %%zmm18 \n\t" - "vpaddd (%%rax, %4), %%zmm19, %%zmm19 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%zmm20, %%zmm20 \n\t" - "vpaddd (%%rax, %4), %%zmm21, %%zmm21 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%zmm22, %%zmm22 \n\t" - "vpaddd (%%rax, %4), %%zmm23, %%zmm23 \n\t" - - "cmpq $0x0, %5 \n\t" - "je 3f \n\t" - - "vbroadcastss (%5), %%zmm24 \n\t" - "vcvtdq2ps %%zmm0, %%zmm0 \n\t" - "vcvtdq2ps %%zmm1, %%zmm1 \n\t" - "vcvtdq2ps %%zmm2, %%zmm2 \n\t" - "vcvtdq2ps %%zmm3, %%zmm3 \n\t" - "vcvtdq2ps %%zmm4, %%zmm4 \n\t" - "vcvtdq2ps %%zmm5, %%zmm5 \n\t" - "vcvtdq2ps %%zmm6, %%zmm6 \n\t" - "vcvtdq2ps %%zmm7, %%zmm7 \n\t" - "vcvtdq2ps %%zmm8, %%zmm8 \n\t" - "vcvtdq2ps %%zmm9, %%zmm9 \n\t" - "vcvtdq2ps %%zmm10, %%zmm10 \n\t" - "vcvtdq2ps %%zmm11, %%zmm11 \n\t" - "vcvtdq2ps %%zmm12, %%zmm12 \n\t" - "vcvtdq2ps %%zmm13, %%zmm13 \n\t" - "vcvtdq2ps %%zmm14, %%zmm14 \n\t" - "vcvtdq2ps %%zmm15, %%zmm15 \n\t" - "vcvtdq2ps %%zmm16, %%zmm16 \n\t" - "vcvtdq2ps 
%%zmm17, %%zmm17 \n\t" - "vcvtdq2ps %%zmm18, %%zmm18 \n\t" - "vcvtdq2ps %%zmm19, %%zmm19 \n\t" - "vcvtdq2ps %%zmm20, %%zmm20 \n\t" - "vcvtdq2ps %%zmm21, %%zmm21 \n\t" - "vcvtdq2ps %%zmm22, %%zmm22 \n\t" - "vcvtdq2ps %%zmm23, %%zmm23 \n\t" - "vmulps %%zmm0, %%zmm24, %%zmm0 \n\t" - "vmulps %%zmm1, %%zmm24, %%zmm1 \n\t" - "vmulps %%zmm2, %%zmm24, %%zmm2 \n\t" - "vmulps %%zmm3, %%zmm24, %%zmm3 \n\t" - "vmulps %%zmm4, %%zmm24, %%zmm4 \n\t" - "vmulps %%zmm5, %%zmm24, %%zmm5 \n\t" - "vmulps %%zmm6, %%zmm24, %%zmm6 \n\t" - "vmulps %%zmm7, %%zmm24, %%zmm7 \n\t" - "vmulps %%zmm8, %%zmm24, %%zmm8 \n\t" - "vmulps %%zmm9, %%zmm24, %%zmm9 \n\t" - "vmulps %%zmm10, %%zmm24, %%zmm10 \n\t" - "vmulps %%zmm11, %%zmm24, %%zmm11 \n\t" - "vmulps %%zmm12, %%zmm24, %%zmm12 \n\t" - "vmulps %%zmm13, %%zmm24, %%zmm13 \n\t" - "vmulps %%zmm14, %%zmm24, %%zmm14 \n\t" - "vmulps %%zmm15, %%zmm24, %%zmm15 \n\t" - "vmulps %%zmm16, %%zmm24, %%zmm16 \n\t" - "vmulps %%zmm17, %%zmm24, %%zmm17 \n\t" - "vmulps %%zmm18, %%zmm24, %%zmm18 \n\t" - "vmulps %%zmm19, %%zmm24, %%zmm19 \n\t" - "vmulps %%zmm20, %%zmm24, %%zmm20 \n\t" - "vmulps %%zmm21, %%zmm24, %%zmm21 \n\t" - "vmulps %%zmm22, %%zmm24, %%zmm22 \n\t" - "vmulps %%zmm23, %%zmm24, %%zmm23 \n\t" - - "movq %8, %%rbx \n\t" - "andq $0x2, %%rbx \n\t" - "je 3f \n\t" - "vcvtps2dq %%zmm0, %%zmm0 \n\t" - "vcvtps2dq %%zmm1, %%zmm1 \n\t" - "vcvtps2dq %%zmm2, %%zmm2 \n\t" - "vcvtps2dq %%zmm3, %%zmm3 \n\t" - "vcvtps2dq %%zmm4, %%zmm4 \n\t" - "vcvtps2dq %%zmm5, %%zmm5 \n\t" - "vcvtps2dq %%zmm6, %%zmm6 \n\t" - "vcvtps2dq %%zmm7, %%zmm7 \n\t" - "vcvtps2dq %%zmm8, %%zmm8 \n\t" - "vcvtps2dq %%zmm9, %%zmm9 \n\t" - "vcvtps2dq %%zmm10, %%zmm10 \n\t" - "vcvtps2dq %%zmm11, %%zmm11 \n\t" - "vcvtps2dq %%zmm12, %%zmm12 \n\t" - "vcvtps2dq %%zmm13, %%zmm13 \n\t" - "vcvtps2dq %%zmm14, %%zmm14 \n\t" - "vcvtps2dq %%zmm15, %%zmm15 \n\t" - "vcvtps2dq %%zmm16, %%zmm16 \n\t" - "vcvtps2dq %%zmm17, %%zmm17 \n\t" - "vcvtps2dq %%zmm18, %%zmm18 \n\t" - "vcvtps2dq %%zmm19, %%zmm19 \n\t" - "vcvtps2dq %%zmm20, %%zmm20 \n\t" - "vcvtps2dq %%zmm21, %%zmm21 \n\t" - "vcvtps2dq %%zmm22, %%zmm22 \n\t" - "vcvtps2dq %%zmm23, %%zmm23 \n\t" - "mov $128, %%eax \n\t" - "vmovd %%eax, %%xmm25 \n\t" - "vbroadcastss %%xmm25, %%zmm24 \n\t" - "vpaddd %%zmm0, %%zmm24, %%zmm0 \n\t" - "vpaddd %%zmm1, %%zmm24, %%zmm1 \n\t" - "vpaddd %%zmm2, %%zmm24, %%zmm2 \n\t" - "vpaddd %%zmm3, %%zmm24, %%zmm3 \n\t" - "vpaddd %%zmm4, %%zmm24, %%zmm4 \n\t" - "vpaddd %%zmm5, %%zmm24, %%zmm5 \n\t" - "vpaddd %%zmm6, %%zmm24, %%zmm6 \n\t" - "vpaddd %%zmm7, %%zmm24, %%zmm7 \n\t" - "vpaddd %%zmm8, %%zmm24, %%zmm8 \n\t" - "vpaddd %%zmm9, %%zmm24, %%zmm9 \n\t" - "vpaddd %%zmm10, %%zmm24, %%zmm10 \n\t" - "vpaddd %%zmm11, %%zmm24, %%zmm11 \n\t" - "vpaddd %%zmm12, %%zmm24, %%zmm12 \n\t" - "vpaddd %%zmm13, %%zmm24, %%zmm13 \n\t" - "vpaddd %%zmm14, %%zmm24, %%zmm14 \n\t" - "vpaddd %%zmm15, %%zmm24, %%zmm15 \n\t" - "vpaddd %%zmm16, %%zmm24, %%zmm16 \n\t" - "vpaddd %%zmm17, %%zmm24, %%zmm17 \n\t" - "vpaddd %%zmm18, %%zmm24, %%zmm18 \n\t" - "vpaddd %%zmm19, %%zmm24, %%zmm19 \n\t" - "vpaddd %%zmm20, %%zmm24, %%zmm20 \n\t" - "vpaddd %%zmm21, %%zmm24, %%zmm21 \n\t" - "vpaddd %%zmm22, %%zmm24, %%zmm22 \n\t" - "vpaddd %%zmm23, %%zmm24, %%zmm23 \n\t" - "movq %9, %%rax \n\t" - "shr $2, %4 \n\t" - "movq %4, %%rcx \n\t" - "addq %4, %%rcx \n\t" - "vpmovusdb %%zmm0, (%%rax) \n\t" - "vpmovusdb %%zmm1, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm2, (%%rax) \n\t" - "vpmovusdb %%zmm3, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm4, (%%rax) \n\t" - 
"vpmovusdb %%zmm5, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm6, (%%rax) \n\t" - "vpmovusdb %%zmm7, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm8, (%%rax) \n\t" - "vpmovusdb %%zmm9, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm10, (%%rax) \n\t" - "vpmovusdb %%zmm11, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm12, (%%rax) \n\t" - "vpmovusdb %%zmm13, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm14, (%%rax) \n\t" - "vpmovusdb %%zmm15, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm16, (%%rax) \n\t" - "vpmovusdb %%zmm17, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm18, (%%rax) \n\t" - "vpmovusdb %%zmm19, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm20, (%%rax) \n\t" - "vpmovusdb %%zmm21, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm22, (%%rax) \n\t" - "vpmovusdb %%zmm23, (%%rax, %4) \n\t" - "jmp 4f \n\t" - - ".align 16 \n\t" - "3: \n\t" - "movq %2, %%rax \n\t" - "vmovups %%zmm0, (%%rax) \n\t" - "vmovups %%zmm1, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%zmm2, (%%rax) \n\t" - "vmovups %%zmm3, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%zmm4, (%%rax) \n\t" - "vmovups %%zmm5, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%zmm6, (%%rax) \n\t" - "vmovups %%zmm7, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%zmm8, (%%rax) \n\t" - "vmovups %%zmm9, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%zmm10, (%%rax) \n\t" - "vmovups %%zmm11, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%zmm12, (%%rax) \n\t" - "vmovups %%zmm13, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%zmm14, (%%rax) \n\t" - "vmovups %%zmm15, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%zmm16, (%%rax) \n\t" - "vmovups %%zmm17, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%zmm18, (%%rax) \n\t" - "vmovups %%zmm19, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%zmm20, (%%rax) \n\t" - "vmovups %%zmm21, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%zmm22, (%%rax) \n\t" - "vmovups %%zmm23, (%%rax, %4) \n\t" - - ".align 16 \n\t" - "4: \n\t" - : - : "r"(matrixA), "r"(matrixB), "r"(matrixC), "c"((int64_t)bk), "r"((int64_t)(N * 4)), - "r"(scale), "r"((int64_t)stepK), "r"(offsetC), "r"((int64_t)flags), "r"(u8Result) - : "%rax", "%rbx", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", - "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", - "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", "%zmm24", "%zmm25", - "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", "%zmm31", "memory", "cc"); -} - -#ifdef _USE_AVX512_VNNI -#define mmmKernel24x8 \ - "movq %0, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm29 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm30 \n\t" \ - "prefetcht0 0x80(%1) \n\t" \ - "vpdpbusd %%ymm24, %%ymm25, %%ymm0 \n\t" \ - "vpdpbusd %%ymm24, %%ymm26, %%ymm1 \n\t" \ - "vpdpbusd %%ymm24, %%ymm27, %%ymm2 \n\t" \ - "vpdpbusd %%ymm24, %%ymm28, %%ymm3 \n\t" \ - "vpdpbusd %%ymm24, %%ymm29, %%ymm4 \n\t" \ - "vpdpbusd %%ymm24, %%ymm30, %%ymm5 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd 
(%%rax, %6, 2), %%ymm27 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm29 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm30 \n\t" \ - "vpdpbusd %%ymm24, %%ymm25, %%ymm6 \n\t" \ - "vpdpbusd %%ymm24, %%ymm26, %%ymm7 \n\t" \ - "vpdpbusd %%ymm24, %%ymm27, %%ymm8 \n\t" \ - "vpdpbusd %%ymm24, %%ymm28, %%ymm9 \n\t" \ - "vpdpbusd %%ymm24, %%ymm29, %%ymm10 \n\t" \ - "vpdpbusd %%ymm24, %%ymm30, %%ymm11 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm29 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm30 \n\t" \ - "vmovups (%1), %%ymm31 \n\t" \ - "vpdpbusd %%ymm24, %%ymm25, %%ymm12 \n\t" \ - "vpdpbusd %%ymm24, %%ymm26, %%ymm13 \n\t" \ - "vpdpbusd %%ymm24, %%ymm27, %%ymm14 \n\t" \ - "vpdpbusd %%ymm24, %%ymm28, %%ymm15 \n\t" \ - "vpdpbusd %%ymm24, %%ymm29, %%ymm16 \n\t" \ - "vpdpbusd %%ymm24, %%ymm30, %%ymm17 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm29 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm30 \n\t" \ - "vpdpbusd %%ymm24, %%ymm25, %%ymm18 \n\t" \ - "vpdpbusd %%ymm24, %%ymm26, %%ymm19 \n\t" \ - "vpdpbusd %%ymm24, %%ymm27, %%ymm20 \n\t" \ - "vpdpbusd %%ymm24, %%ymm28, %%ymm21 \n\t" \ - "vpdpbusd %%ymm24, %%ymm29, %%ymm22 \n\t" \ - "vpdpbusd %%ymm24, %%ymm30, %%ymm23 \n\t" \ - "movq %0, %%rax \n\t" \ - "addq $0x4, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm29 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm30 \n\t" \ - "vpdpbusd %%ymm31, %%ymm25, %%ymm0 \n\t" \ - "vpdpbusd %%ymm31, %%ymm26, %%ymm1 \n\t" \ - "vpdpbusd %%ymm31, %%ymm27, %%ymm2 \n\t" \ - "vpdpbusd %%ymm31, %%ymm28, %%ymm3 \n\t" \ - "vpdpbusd %%ymm31, %%ymm29, %%ymm4 \n\t" \ - "vpdpbusd %%ymm31, %%ymm30, %%ymm5 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm29 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm30 \n\t" \ - "vpdpbusd %%ymm31, %%ymm25, %%ymm6 \n\t" \ - "vpdpbusd %%ymm31, %%ymm26, %%ymm7 \n\t" \ - "vpdpbusd %%ymm31, %%ymm27, %%ymm8 \n\t" \ - "vpdpbusd %%ymm31, %%ymm28, %%ymm9 \n\t" \ - "vpdpbusd %%ymm31, %%ymm29, %%ymm10 \n\t" \ - "vpdpbusd %%ymm31, %%ymm30, %%ymm11 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm29 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm30 \n\t" \ - "vmovups 0x20(%1), %%ymm24 \n\t" \ - "vpdpbusd %%ymm31, %%ymm25, %%ymm12 \n\t" \ - "vpdpbusd %%ymm31, %%ymm26, %%ymm13 \n\t" \ - "vpdpbusd %%ymm31, %%ymm27, %%ymm14 \n\t" \ - "vpdpbusd %%ymm31, %%ymm28, %%ymm15 \n\t" \ - "vpdpbusd %%ymm31, %%ymm29, %%ymm16 \n\t" \ - "vpdpbusd %%ymm31, %%ymm30, %%ymm17 
\n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm29 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm30 \n\t" \ - "vpdpbusd %%ymm31, %%ymm25, %%ymm18 \n\t" \ - "vpdpbusd %%ymm31, %%ymm26, %%ymm19 \n\t" \ - "vpdpbusd %%ymm31, %%ymm27, %%ymm20 \n\t" \ - "vpdpbusd %%ymm31, %%ymm28, %%ymm21 \n\t" \ - "vpdpbusd %%ymm31, %%ymm29, %%ymm22 \n\t" \ - "vpdpbusd %%ymm31, %%ymm30, %%ymm23 \n\t" -#else -#define mmmKernel24x8 \ - "movq %0, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "prefetcht0 0x80(%1) \n\t" \ - "vpmaddubsw %%ymm24, %%ymm25, %%ymm28 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm26, %%ymm29 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm27, %%ymm30 \n\t" \ - "vpmaddwd %%ymm28, %%ymm31, %%ymm28 \n\t" \ - "vpmaddwd %%ymm29, %%ymm31, %%ymm29 \n\t" \ - "vpmaddwd %%ymm30, %%ymm31, %%ymm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "vpaddd %%ymm0, %%ymm28, %%ymm0 \n\t" \ - "vpaddd %%ymm1, %%ymm29, %%ymm1 \n\t" \ - "vpaddd %%ymm2, %%ymm30, %%ymm2 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm25, %%ymm28 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm26, %%ymm29 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm27, %%ymm30 \n\t" \ - "vpmaddwd %%ymm28, %%ymm31, %%ymm28 \n\t" \ - "vpmaddwd %%ymm29, %%ymm31, %%ymm29 \n\t" \ - "vpmaddwd %%ymm30, %%ymm31, %%ymm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "vpaddd %%ymm3, %%ymm28, %%ymm3 \n\t" \ - "vpaddd %%ymm4, %%ymm29, %%ymm4 \n\t" \ - "vpaddd %%ymm5, %%ymm30, %%ymm5 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm25, %%ymm28 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm26, %%ymm29 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm27, %%ymm30 \n\t" \ - "vpmaddwd %%ymm28, %%ymm31, %%ymm28 \n\t" \ - "vpmaddwd %%ymm29, %%ymm31, %%ymm29 \n\t" \ - "vpmaddwd %%ymm30, %%ymm31, %%ymm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "vpaddd %%ymm6, %%ymm28, %%ymm6 \n\t" \ - "vpaddd %%ymm7, %%ymm29, %%ymm7 \n\t" \ - "vpaddd %%ymm8, %%ymm30, %%ymm8 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm25, %%ymm28 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm26, %%ymm29 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm27, %%ymm30 \n\t" \ - "vpmaddwd %%ymm28, %%ymm31, %%ymm28 \n\t" \ - "vpmaddwd %%ymm29, %%ymm31, %%ymm29 \n\t" \ - "vpmaddwd %%ymm30, %%ymm31, %%ymm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "vpaddd %%ymm9, %%ymm28, %%ymm9 \n\t" \ - "vpaddd %%ymm10, %%ymm29, %%ymm10 \n\t" \ - "vpaddd %%ymm11, %%ymm30, %%ymm11 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm25, %%ymm28 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm26, %%ymm29 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm27, %%ymm30 \n\t" \ - "vpmaddwd %%ymm28, %%ymm31, %%ymm28 \n\t" \ - "vpmaddwd %%ymm29, %%ymm31, %%ymm29 \n\t" \ - "vpmaddwd %%ymm30, %%ymm31, %%ymm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), 
%%ymm27 \n\t" \ - "vpaddd %%ymm12, %%ymm28, %%ymm12 \n\t" \ - "vpaddd %%ymm13, %%ymm29, %%ymm13 \n\t" \ - "vpaddd %%ymm14, %%ymm30, %%ymm14 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm25, %%ymm28 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm26, %%ymm29 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm27, %%ymm30 \n\t" \ - "vpmaddwd %%ymm28, %%ymm31, %%ymm28 \n\t" \ - "vpmaddwd %%ymm29, %%ymm31, %%ymm29 \n\t" \ - "vpmaddwd %%ymm30, %%ymm31, %%ymm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "vpaddd %%ymm15, %%ymm28, %%ymm15 \n\t" \ - "vpaddd %%ymm16, %%ymm29, %%ymm16 \n\t" \ - "vpaddd %%ymm17, %%ymm30, %%ymm17 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm25, %%ymm28 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm26, %%ymm29 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm27, %%ymm30 \n\t" \ - "vpmaddwd %%ymm28, %%ymm31, %%ymm28 \n\t" \ - "vpmaddwd %%ymm29, %%ymm31, %%ymm29 \n\t" \ - "vpmaddwd %%ymm30, %%ymm31, %%ymm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "vpaddd %%ymm18, %%ymm28, %%ymm18 \n\t" \ - "vpaddd %%ymm19, %%ymm29, %%ymm19 \n\t" \ - "vpaddd %%ymm20, %%ymm30, %%ymm20 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm25, %%ymm28 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm26, %%ymm29 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm27, %%ymm30 \n\t" \ - "vpmaddwd %%ymm28, %%ymm31, %%ymm28 \n\t" \ - "vpmaddwd %%ymm29, %%ymm31, %%ymm29 \n\t" \ - "vpmaddwd %%ymm30, %%ymm31, %%ymm30 \n\t" \ - "movq %0, %%rax \n\t" \ - "addq $0x4, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "vmovups (%1), %%ymm24 \n\t" \ - "vpaddd %%ymm21, %%ymm28, %%ymm21 \n\t" \ - "vpaddd %%ymm22, %%ymm29, %%ymm22 \n\t" \ - "vpaddd %%ymm23, %%ymm30, %%ymm23 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm25, %%ymm28 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm26, %%ymm29 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm27, %%ymm30 \n\t" \ - "vpmaddwd %%ymm28, %%ymm31, %%ymm28 \n\t" \ - "vpmaddwd %%ymm29, %%ymm31, %%ymm29 \n\t" \ - "vpmaddwd %%ymm30, %%ymm31, %%ymm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "vpaddd %%ymm0, %%ymm28, %%ymm0 \n\t" \ - "vpaddd %%ymm1, %%ymm29, %%ymm1 \n\t" \ - "vpaddd %%ymm2, %%ymm30, %%ymm2 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm25, %%ymm28 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm26, %%ymm29 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm27, %%ymm30 \n\t" \ - "vpmaddwd %%ymm28, %%ymm31, %%ymm28 \n\t" \ - "vpmaddwd %%ymm29, %%ymm31, %%ymm29 \n\t" \ - "vpmaddwd %%ymm30, %%ymm31, %%ymm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "vpaddd %%ymm3, %%ymm28, %%ymm3 \n\t" \ - "vpaddd %%ymm4, %%ymm29, %%ymm4 \n\t" \ - "vpaddd %%ymm5, %%ymm30, %%ymm5 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm25, %%ymm28 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm26, %%ymm29 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm27, %%ymm30 \n\t" \ - "vpmaddwd %%ymm28, %%ymm31, %%ymm28 \n\t" \ - "vpmaddwd %%ymm29, %%ymm31, %%ymm29 \n\t" \ - "vpmaddwd %%ymm30, %%ymm31, %%ymm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "vpaddd %%ymm6, %%ymm28, 
%%ymm6 \n\t" \ - "vpaddd %%ymm7, %%ymm29, %%ymm7 \n\t" \ - "vpaddd %%ymm8, %%ymm30, %%ymm8 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm25, %%ymm28 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm26, %%ymm29 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm27, %%ymm30 \n\t" \ - "vpmaddwd %%ymm28, %%ymm31, %%ymm28 \n\t" \ - "vpmaddwd %%ymm29, %%ymm31, %%ymm29 \n\t" \ - "vpmaddwd %%ymm30, %%ymm31, %%ymm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "vpaddd %%ymm9, %%ymm28, %%ymm9 \n\t" \ - "vpaddd %%ymm10, %%ymm29, %%ymm10 \n\t" \ - "vpaddd %%ymm11, %%ymm30, %%ymm11 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm25, %%ymm28 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm26, %%ymm29 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm27, %%ymm30 \n\t" \ - "vpmaddwd %%ymm28, %%ymm31, %%ymm28 \n\t" \ - "vpmaddwd %%ymm29, %%ymm31, %%ymm29 \n\t" \ - "vpmaddwd %%ymm30, %%ymm31, %%ymm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "vpaddd %%ymm12, %%ymm28, %%ymm12 \n\t" \ - "vpaddd %%ymm13, %%ymm29, %%ymm13 \n\t" \ - "vpaddd %%ymm14, %%ymm30, %%ymm14 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm25, %%ymm28 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm26, %%ymm29 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm27, %%ymm30 \n\t" \ - "vpmaddwd %%ymm28, %%ymm31, %%ymm28 \n\t" \ - "vpmaddwd %%ymm29, %%ymm31, %%ymm29 \n\t" \ - "vpmaddwd %%ymm30, %%ymm31, %%ymm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "vpaddd %%ymm15, %%ymm28, %%ymm15 \n\t" \ - "vpaddd %%ymm16, %%ymm29, %%ymm16 \n\t" \ - "vpaddd %%ymm17, %%ymm30, %%ymm17 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm25, %%ymm28 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm26, %%ymm29 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm27, %%ymm30 \n\t" \ - "vpmaddwd %%ymm28, %%ymm31, %%ymm28 \n\t" \ - "vpmaddwd %%ymm29, %%ymm31, %%ymm29 \n\t" \ - "vpmaddwd %%ymm30, %%ymm31, %%ymm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "vpaddd %%ymm18, %%ymm28, %%ymm18 \n\t" \ - "vpaddd %%ymm19, %%ymm29, %%ymm19 \n\t" \ - "vpaddd %%ymm20, %%ymm30, %%ymm20 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm25, %%ymm28 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm26, %%ymm29 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm27, %%ymm30 \n\t" \ - "vpmaddwd %%ymm28, %%ymm31, %%ymm28 \n\t" \ - "vpmaddwd %%ymm29, %%ymm31, %%ymm29 \n\t" \ - "vpmaddwd %%ymm30, %%ymm31, %%ymm30 \n\t" \ - "vmovups 0x20(%1), %%ymm24 \n\t" \ - "vpaddd %%ymm21, %%ymm28, %%ymm21 \n\t" \ - "vpaddd %%ymm22, %%ymm29, %%ymm22 \n\t" \ - "vpaddd %%ymm23, %%ymm30, %%ymm23 \n\t" -#endif - -inline void mmm_avx512_24x8_asm(U32 um, - U32 un, - U32 bk, - UINT8 *matrixA, - INT8 *matrixB, - I32 *matrixC, - UINT8 *u8Result, - I32 *offsetC, - U32 N, - U32 stepK, - const F32 *scale, - U32 flags) -{ - __asm__ __volatile__( - "prefetcht0 0x40(%1) \n\t" - "vmovups (%1), %%ymm24 \n\t" - "add $0x20, %1 \n\t" -#ifndef _USE_AVX512_VNNI - "mov $1, %%ebx \n\t" - "vmovd %%ebx, %%xmm0 \n\t" - "vpbroadcastw %%xmm0, %%ymm31 \n\t" -#endif - "movq %8, %%rbx \n\t" - "andq $0x1, %%rbx \n\t" - "jne 0f \n\t" - "vmovups (%7), %%ymm0 \n\t" - "vmovups %%ymm0, %%ymm1 \n\t" - "vmovups %%ymm0, %%ymm2 \n\t" - "vmovups %%ymm0, %%ymm3 \n\t" - "vmovups %%ymm0, %%ymm4 \n\t" - "vmovups %%ymm0, %%ymm5 
\n\t" - "vmovups %%ymm0, %%ymm6 \n\t" - "vmovups %%ymm0, %%ymm7 \n\t" - "vmovups %%ymm0, %%ymm8 \n\t" - "vmovups %%ymm0, %%ymm9 \n\t" - "vmovups %%ymm0, %%ymm10 \n\t" - "vmovups %%ymm0, %%ymm11 \n\t" - "vmovups %%ymm0, %%ymm12 \n\t" - "vmovups %%ymm0, %%ymm13 \n\t" - "vmovups %%ymm0, %%ymm14 \n\t" - "vmovups %%ymm0, %%ymm15 \n\t" - "vmovups %%ymm0, %%ymm16 \n\t" - "vmovups %%ymm0, %%ymm17 \n\t" - "vmovups %%ymm0, %%ymm18 \n\t" - "vmovups %%ymm0, %%ymm19 \n\t" - "vmovups %%ymm0, %%ymm20 \n\t" - "vmovups %%ymm0, %%ymm21 \n\t" - "vmovups %%ymm0, %%ymm22 \n\t" - "vmovups %%ymm0, %%ymm23 \n\t" - "jmp 1f \n\t" - ".align 16 \n\t" - "0: \n\t" - "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" - "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" - "vxorps %%ymm2, %%ymm2, %%ymm2 \n\t" - "vxorps %%ymm3, %%ymm3, %%ymm3 \n\t" - "vxorps %%ymm4, %%ymm4, %%ymm4 \n\t" - "vxorps %%ymm5, %%ymm5, %%ymm5 \n\t" - "vxorps %%ymm6, %%ymm6, %%ymm6 \n\t" - "vxorps %%ymm7, %%ymm7, %%ymm7 \n\t" - "vxorps %%ymm8, %%ymm8, %%ymm8 \n\t" - "vxorps %%ymm9, %%ymm9, %%ymm9 \n\t" - "vxorps %%ymm10, %%ymm10, %%ymm10 \n\t" - "vxorps %%ymm11, %%ymm11, %%ymm11 \n\t" - "vxorps %%ymm12, %%ymm12, %%ymm12 \n\t" - "vxorps %%ymm13, %%ymm13, %%ymm13 \n\t" - "vxorps %%ymm14, %%ymm14, %%ymm14 \n\t" - "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" - "vxorps %%ymm16, %%ymm16, %%ymm16 \n\t" - "vxorps %%ymm17, %%ymm17, %%ymm17 \n\t" - "vxorps %%ymm18, %%ymm18, %%ymm18 \n\t" - "vxorps %%ymm19, %%ymm19, %%ymm19 \n\t" - "vxorps %%ymm20, %%ymm20, %%ymm20 \n\t" - "vxorps %%ymm21, %%ymm21, %%ymm21 \n\t" - "vxorps %%ymm22, %%ymm22, %%ymm22 \n\t" - "vxorps %%ymm23, %%ymm23, %%ymm23 \n\t" - ".align 16 \n\t" - "1: \n\t" - "movq %2, %%rax \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "movq %6, %%rbx \n\t" - "addq %6, %%rbx \n\t" - "addq %6, %%rbx \n\t" - - ".align 16 \n\t" - "2: \n\t" mmmKernel24x8 - - "add $0x40, %1 \n\t" - "add $0x8, %0 \n\t" - "dec %%rcx \n\t" - "jg 2b \n\t" - - "movq %2, %%rax \n\t" - "movq %4, %%rcx \n\t" - "addq %4, %%rcx \n\t" - "vpaddd (%%rax), %%ymm0, %%ymm0 \n\t" - "vpaddd (%%rax, %4), %%ymm1, %%ymm1 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%ymm2, %%ymm2 \n\t" - "vpaddd (%%rax, %4), %%ymm3, %%ymm3 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%ymm4, %%ymm4 \n\t" - "vpaddd (%%rax, %4), %%ymm5, %%ymm5 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%ymm6, %%ymm6 \n\t" - "vpaddd (%%rax, %4), %%ymm7, %%ymm7 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%ymm8, %%ymm8 \n\t" - "vpaddd (%%rax, %4), %%ymm9, %%ymm9 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%ymm10, %%ymm10 \n\t" - "vpaddd 
(%%rax, %4), %%ymm11, %%ymm11 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%ymm12, %%ymm12 \n\t" - "vpaddd (%%rax, %4), %%ymm13, %%ymm13 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%ymm14, %%ymm14 \n\t" - "vpaddd (%%rax, %4), %%ymm15, %%ymm15 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%ymm16, %%ymm16 \n\t" - "vpaddd (%%rax, %4), %%ymm17, %%ymm17 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%ymm18, %%ymm18 \n\t" - "vpaddd (%%rax, %4), %%ymm19, %%ymm19 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%ymm20, %%ymm20 \n\t" - "vpaddd (%%rax, %4), %%ymm21, %%ymm21 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%ymm22, %%ymm22 \n\t" - "vpaddd (%%rax, %4), %%ymm23, %%ymm23 \n\t" - - "cmpq $0x0, %5 \n\t" - "je 3f \n\t" - - "vbroadcastss (%5), %%ymm24 \n\t" - "vcvtdq2ps %%ymm0, %%ymm0 \n\t" - "vcvtdq2ps %%ymm1, %%ymm1 \n\t" - "vcvtdq2ps %%ymm2, %%ymm2 \n\t" - "vcvtdq2ps %%ymm3, %%ymm3 \n\t" - "vcvtdq2ps %%ymm4, %%ymm4 \n\t" - "vcvtdq2ps %%ymm5, %%ymm5 \n\t" - "vcvtdq2ps %%ymm6, %%ymm6 \n\t" - "vcvtdq2ps %%ymm7, %%ymm7 \n\t" - "vcvtdq2ps %%ymm8, %%ymm8 \n\t" - "vcvtdq2ps %%ymm9, %%ymm9 \n\t" - "vcvtdq2ps %%ymm10, %%ymm10 \n\t" - "vcvtdq2ps %%ymm11, %%ymm11 \n\t" - "vcvtdq2ps %%ymm12, %%ymm12 \n\t" - "vcvtdq2ps %%ymm13, %%ymm13 \n\t" - "vcvtdq2ps %%ymm14, %%ymm14 \n\t" - "vcvtdq2ps %%ymm15, %%ymm15 \n\t" - "vcvtdq2ps %%ymm16, %%ymm16 \n\t" - "vcvtdq2ps %%ymm17, %%ymm17 \n\t" - "vcvtdq2ps %%ymm18, %%ymm18 \n\t" - "vcvtdq2ps %%ymm19, %%ymm19 \n\t" - "vcvtdq2ps %%ymm20, %%ymm20 \n\t" - "vcvtdq2ps %%ymm21, %%ymm21 \n\t" - "vcvtdq2ps %%ymm22, %%ymm22 \n\t" - "vcvtdq2ps %%ymm23, %%ymm23 \n\t" - "vmulps %%ymm0, %%ymm24, %%ymm0 \n\t" - "vmulps %%ymm1, %%ymm24, %%ymm1 \n\t" - "vmulps %%ymm2, %%ymm24, %%ymm2 \n\t" - "vmulps %%ymm3, %%ymm24, %%ymm3 \n\t" - "vmulps %%ymm4, %%ymm24, %%ymm4 \n\t" - "vmulps %%ymm5, %%ymm24, %%ymm5 \n\t" - "vmulps %%ymm6, %%ymm24, %%ymm6 \n\t" - "vmulps %%ymm7, %%ymm24, %%ymm7 \n\t" - "vmulps %%ymm8, %%ymm24, %%ymm8 \n\t" - "vmulps %%ymm9, %%ymm24, %%ymm9 \n\t" - "vmulps %%ymm10, %%ymm24, %%ymm10 \n\t" - "vmulps %%ymm11, %%ymm24, %%ymm11 \n\t" - "vmulps %%ymm12, %%ymm24, %%ymm12 \n\t" - "vmulps %%ymm13, %%ymm24, %%ymm13 \n\t" - "vmulps %%ymm14, %%ymm24, %%ymm14 \n\t" - "vmulps %%ymm15, %%ymm24, %%ymm15 \n\t" - "vmulps %%ymm16, %%ymm24, %%ymm16 \n\t" - "vmulps %%ymm17, %%ymm24, %%ymm17 \n\t" - "vmulps %%ymm18, %%ymm24, %%ymm18 \n\t" - "vmulps %%ymm19, %%ymm24, %%ymm19 \n\t" - "vmulps %%ymm20, %%ymm24, %%ymm20 \n\t" - "vmulps %%ymm21, %%ymm24, %%ymm21 \n\t" - "vmulps %%ymm22, %%ymm24, %%ymm22 \n\t" - "vmulps %%ymm23, %%ymm24, %%ymm23 \n\t" - - "movq %8, %%rbx \n\t" - "andq $0x2, %%rbx \n\t" - "je 3f \n\t" - "vcvtps2dq %%zmm0, %%zmm0 \n\t" - "vcvtps2dq %%zmm1, %%zmm1 \n\t" - "vcvtps2dq %%zmm2, %%zmm2 \n\t" - "vcvtps2dq %%zmm3, %%zmm3 \n\t" - "vcvtps2dq %%zmm4, %%zmm4 \n\t" - "vcvtps2dq %%zmm5, %%zmm5 \n\t" - "vcvtps2dq %%zmm6, %%zmm6 \n\t" - "vcvtps2dq %%zmm7, %%zmm7 \n\t" - "vcvtps2dq %%zmm8, %%zmm8 \n\t" - "vcvtps2dq %%zmm9, %%zmm9 \n\t" - "vcvtps2dq %%zmm10, %%zmm10 \n\t" - "vcvtps2dq %%zmm11, %%zmm11 \n\t" - "vcvtps2dq %%zmm12, %%zmm12 \n\t" - "vcvtps2dq %%zmm13, %%zmm13 \n\t" - "vcvtps2dq %%zmm14, %%zmm14 \n\t" - "vcvtps2dq %%zmm15, %%zmm15 \n\t" - "vcvtps2dq %%zmm16, %%zmm16 \n\t" - "vcvtps2dq %%zmm17, %%zmm17 \n\t" - "vcvtps2dq %%zmm18, %%zmm18 \n\t" - "vcvtps2dq %%zmm19, %%zmm19 \n\t" - "vcvtps2dq %%zmm20, %%zmm20 \n\t" - "vcvtps2dq %%zmm21, %%zmm21 \n\t" - "vcvtps2dq %%zmm22, %%zmm22 \n\t" - "vcvtps2dq %%zmm23, %%zmm23 \n\t" - "mov 
$128, %%eax \n\t" - "vmovd %%eax, %%xmm25 \n\t" - "vbroadcastss %%xmm25, %%zmm24 \n\t" - "vpaddd %%zmm0, %%zmm24, %%zmm0 \n\t" - "vpaddd %%zmm1, %%zmm24, %%zmm1 \n\t" - "vpaddd %%zmm2, %%zmm24, %%zmm2 \n\t" - "vpaddd %%zmm3, %%zmm24, %%zmm3 \n\t" - "vpaddd %%zmm4, %%zmm24, %%zmm4 \n\t" - "vpaddd %%zmm5, %%zmm24, %%zmm5 \n\t" - "vpaddd %%zmm6, %%zmm24, %%zmm6 \n\t" - "vpaddd %%zmm7, %%zmm24, %%zmm7 \n\t" - "vpaddd %%zmm8, %%zmm24, %%zmm8 \n\t" - "vpaddd %%zmm9, %%zmm24, %%zmm9 \n\t" - "vpaddd %%zmm10, %%zmm24, %%zmm10 \n\t" - "vpaddd %%zmm11, %%zmm24, %%zmm11 \n\t" - "vpaddd %%zmm12, %%zmm24, %%zmm12 \n\t" - "vpaddd %%zmm13, %%zmm24, %%zmm13 \n\t" - "vpaddd %%zmm14, %%zmm24, %%zmm14 \n\t" - "vpaddd %%zmm15, %%zmm24, %%zmm15 \n\t" - "vpaddd %%zmm16, %%zmm24, %%zmm16 \n\t" - "vpaddd %%zmm17, %%zmm24, %%zmm17 \n\t" - "vpaddd %%zmm18, %%zmm24, %%zmm18 \n\t" - "vpaddd %%zmm19, %%zmm24, %%zmm19 \n\t" - "vpaddd %%zmm20, %%zmm24, %%zmm20 \n\t" - "vpaddd %%zmm21, %%zmm24, %%zmm21 \n\t" - "vpaddd %%zmm22, %%zmm24, %%zmm22 \n\t" - "vpaddd %%zmm23, %%zmm24, %%zmm23 \n\t" - "movq %9, %%rax \n\t" - "shr $2, %4 \n\t" - "movq %4, %%rcx \n\t" - "addq %4, %%rcx \n\t" - "vpmovusdb %%zmm0, (%%rax) \n\t" - "vpmovusdb %%zmm1, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm2, (%%rax) \n\t" - "vpmovusdb %%zmm3, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm4, (%%rax) \n\t" - "vpmovusdb %%zmm5, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm6, (%%rax) \n\t" - "vpmovusdb %%zmm7, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm8, (%%rax) \n\t" - "vpmovusdb %%zmm9, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm10, (%%rax) \n\t" - "vpmovusdb %%zmm11, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm12, (%%rax) \n\t" - "vpmovusdb %%zmm13, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm14, (%%rax) \n\t" - "vpmovusdb %%zmm15, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm16, (%%rax) \n\t" - "vpmovusdb %%zmm17, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm18, (%%rax) \n\t" - "vpmovusdb %%zmm19, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm20, (%%rax) \n\t" - "vpmovusdb %%zmm21, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm22, (%%rax) \n\t" - "vpmovusdb %%zmm23, (%%rax, %4) \n\t" - "jmp 4f \n\t" - - ".align 16 \n\t" - "3: \n\t" - "movq %2, %%rax \n\t" - "vmovups %%ymm0, (%%rax) \n\t" - "vmovups %%ymm1, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%ymm2, (%%rax) \n\t" - "vmovups %%ymm3, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%ymm4, (%%rax) \n\t" - "vmovups %%ymm5, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%ymm6, (%%rax) \n\t" - "vmovups %%ymm7, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%ymm8, (%%rax) \n\t" - "vmovups %%ymm9, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%ymm10, (%%rax) \n\t" - "vmovups %%ymm11, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%ymm12, (%%rax) \n\t" - "vmovups %%ymm13, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%ymm14, (%%rax) \n\t" - "vmovups %%ymm15, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%ymm16, (%%rax) \n\t" - "vmovups %%ymm17, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%ymm18, (%%rax) \n\t" - "vmovups %%ymm19, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%ymm20, (%%rax) \n\t" - "vmovups %%ymm21, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%ymm22, (%%rax) \n\t" - 
"vmovups %%ymm23, (%%rax, %4) \n\t" - - ".align 16 \n\t" - "4: \n\t" - : - : "r"(matrixA), "r"(matrixB), "r"(matrixC), "c"((int64_t)bk), "r"((long long)(N * 4)), - "r"(scale), "r"((int64_t)stepK), "r"(offsetC), "r"((int64_t)flags), "r"(u8Result) - : "%rax", "%rbx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", - "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "%ymm16", - "%ymm17", "%ymm18", "%ymm19", "%ymm20", "%ymm21", "%ymm22", "%ymm23", "%ymm24", "%ymm25", - "%ymm26", "%ymm27", "%ymm28", "%ymm29", "%ymm30", "%ymm31", "memory", "cc"); -} - -#ifdef _USE_AVX512_VNNI -#define mmmKernel4x48 \ - "movq %0, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm31 \n\t" \ - "prefetcht0 0xC0(%1) \n\t" \ - "prefetcht0 0x100(%1) \n\t" \ - "prefetcht0 0x140(%1) \n\t" \ - "vmovups (%1), %%zmm27 \n\t" \ - "vpdpbusd %%zmm24, %%zmm30, %%zmm0 \n\t" \ - "vpdpbusd %%zmm25, %%zmm30, %%zmm1 \n\t" \ - "vpdpbusd %%zmm26, %%zmm30, %%zmm2 \n\t" \ - "vmovups 0x40(%1), %%zmm28 \n\t" \ - "vpdpbusd %%zmm24, %%zmm31, %%zmm3 \n\t" \ - "vpdpbusd %%zmm25, %%zmm31, %%zmm4 \n\t" \ - "vpdpbusd %%zmm26, %%zmm31, %%zmm5 \n\t" \ - "addq %6, %%rax \n\t" \ - "addq %6, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm31 \n\t" \ - "vmovups 0x80(%1), %%zmm29 \n\t" \ - "vpdpbusd %%zmm24, %%zmm30, %%zmm6 \n\t" \ - "vpdpbusd %%zmm25, %%zmm30, %%zmm7 \n\t" \ - "vpdpbusd %%zmm26, %%zmm30, %%zmm8 \n\t" \ - "vpdpbusd %%zmm24, %%zmm31, %%zmm9 \n\t" \ - "vpdpbusd %%zmm25, %%zmm31, %%zmm10 \n\t" \ - "vpdpbusd %%zmm26, %%zmm31, %%zmm11 \n\t" \ - "movq %0, %%rax \n\t" \ - "addq $0x4, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm31 \n\t" \ - "prefetcht0 0x180(%1) \n\t" \ - "prefetcht0 0x1C0(%1) \n\t" \ - "prefetcht0 0x200(%1) \n\t" \ - "vmovups 0xC0(%1), %%zmm24 \n\t" \ - "vpdpbusd %%zmm27, %%zmm30, %%zmm0 \n\t" \ - "vpdpbusd %%zmm28, %%zmm30, %%zmm1 \n\t" \ - "vpdpbusd %%zmm29, %%zmm30, %%zmm2 \n\t" \ - "vmovups 0x100(%1), %%zmm25 \n\t" \ - "vpdpbusd %%zmm27, %%zmm31, %%zmm3 \n\t" \ - "vpdpbusd %%zmm28, %%zmm31, %%zmm4 \n\t" \ - "vpdpbusd %%zmm29, %%zmm31, %%zmm5 \n\t" \ - "addq %6, %%rax \n\t" \ - "addq %6, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm31 \n\t" \ - "vmovups 0x140(%1), %%zmm26 \n\t" \ - "vpdpbusd %%zmm27, %%zmm30, %%zmm6 \n\t" \ - "vpdpbusd %%zmm28, %%zmm30, %%zmm7 \n\t" \ - "vpdpbusd %%zmm29, %%zmm30, %%zmm8 \n\t" \ - "vpdpbusd %%zmm27, %%zmm31, %%zmm9 \n\t" \ - "vpdpbusd %%zmm28, %%zmm31, %%zmm10 \n\t" \ - "vpdpbusd %%zmm29, %%zmm31, %%zmm11 \n\t" -#else -#define mmmKernel4x48 \ - "movq %0, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "prefetcht0 0xC0(%1) \n\t" \ - "prefetcht0 0x100(%1) \n\t" \ - "prefetcht0 0x140(%1) \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm26, %%zmm30, %%zmm29 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm30 \n\t" \ - "vpaddd %%zmm0, %%zmm27, %%zmm0 \n\t" \ - "vpaddd %%zmm1, %%zmm28, %%zmm1 \n\t" \ - "vpaddd %%zmm2, %%zmm29, %%zmm2 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm26, %%zmm30, %%zmm29 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 
\n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "addq %6, %%rax \n\t" \ - "addq %6, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "vpaddd %%zmm3, %%zmm27, %%zmm3 \n\t" \ - "vpaddd %%zmm4, %%zmm28, %%zmm4 \n\t" \ - "vpaddd %%zmm5, %%zmm29, %%zmm5 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm26, %%zmm30, %%zmm29 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm30 \n\t" \ - "vpaddd %%zmm6, %%zmm27, %%zmm6 \n\t" \ - "vpaddd %%zmm7, %%zmm28, %%zmm7 \n\t" \ - "vpaddd %%zmm8, %%zmm29, %%zmm8 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm26, %%zmm30, %%zmm29 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vmovups (%1), %%zmm24 \n\t" \ - "vmovups 0x40(%1), %%zmm25 \n\t" \ - "vmovups 0x80(%1), %%zmm26 \n\t" \ - "vpaddd %%zmm9, %%zmm27, %%zmm9 \n\t" \ - "vpaddd %%zmm10, %%zmm28, %%zmm10 \n\t" \ - "vpaddd %%zmm11, %%zmm29, %%zmm11 \n\t" \ - "movq %0, %%rax \n\t" \ - "addq $0x4, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "prefetcht0 0x180(%1) \n\t" \ - "prefetcht0 0x1C0(%1) \n\t" \ - "prefetcht0 0x200(%1) \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm26, %%zmm30, %%zmm29 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm30 \n\t" \ - "vpaddd %%zmm0, %%zmm27, %%zmm0 \n\t" \ - "vpaddd %%zmm1, %%zmm28, %%zmm1 \n\t" \ - "vpaddd %%zmm2, %%zmm29, %%zmm2 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm26, %%zmm30, %%zmm29 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "addq %6, %%rax \n\t" \ - "addq %6, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "vpaddd %%zmm3, %%zmm27, %%zmm3 \n\t" \ - "vpaddd %%zmm4, %%zmm28, %%zmm4 \n\t" \ - "vpaddd %%zmm5, %%zmm29, %%zmm5 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm26, %%zmm30, %%zmm29 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm30 \n\t" \ - "vpaddd %%zmm6, %%zmm27, %%zmm6 \n\t" \ - "vpaddd %%zmm7, %%zmm28, %%zmm7 \n\t" \ - "vpaddd %%zmm8, %%zmm29, %%zmm8 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm26, %%zmm30, %%zmm29 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vmovups 0xC0(%1), %%zmm24 \n\t" \ - "vmovups 0x100(%1), %%zmm25 \n\t" \ - "vmovups 0x140(%1), %%zmm26 \n\t" \ - "vpaddd %%zmm9, %%zmm27, %%zmm9 \n\t" \ - "vpaddd %%zmm10, %%zmm28, %%zmm10 \n\t" \ - "vpaddd %%zmm11, %%zmm29, %%zmm11 \n\t" -#endif - -inline void mmm_avx512_4x48_asm(U32 um, - U32 un, - U32 bk, - UINT8 *matrixA, - INT8 *matrixB, - I32 *matrixC, - UINT8 *u8Result, - I32 *offsetC, - U32 N, - U32 stepK, - const 
F32 *scale, - U32 flags) -{ - __asm__ __volatile__( - "prefetcht0 0xC0(%1) \n\t" - "prefetcht0 0x100(%1) \n\t" - "prefetcht0 0x140(%1) \n\t" - "vmovups (%1), %%zmm24 \n\t" - "vmovups 0x40(%1), %%zmm25 \n\t" - "vmovups 0x80(%1), %%zmm26 \n\t" - "add $0xC0, %1 \n\t" -#ifndef _USE_AVX512_VNNI - "mov $1, %%eax \n\t" - "vmovd %%eax, %%xmm0 \n\t" - "vpbroadcastw %%xmm0, %%zmm31 \n\t" -#endif - "movq %%rbx, %%rax \n\t" - "andq $0x1, %%rax \n\t" - "jne 0f \n\t" - "vmovups (%7), %%zmm0 \n\t" - "vmovups 0x40(%7), %%zmm1 \n\t" - "vmovups 0x80(%7), %%zmm2 \n\t" - "vmovups %%zmm0, %%zmm3 \n\t" - "vmovups %%zmm1, %%zmm4 \n\t" - "vmovups %%zmm2, %%zmm5 \n\t" - "vmovups %%zmm0, %%zmm6 \n\t" - "vmovups %%zmm1, %%zmm7 \n\t" - "vmovups %%zmm2, %%zmm8 \n\t" - "vmovups %%zmm0, %%zmm9 \n\t" - "vmovups %%zmm1, %%zmm10 \n\t" - "vmovups %%zmm2, %%zmm11 \n\t" - "jmp 1f \n\t" - - ".align 16 \n\t" - "0: \n\t" - "vxorps %%zmm0, %%zmm0, %%zmm0 \n\t" - "vxorps %%zmm1, %%zmm1, %%zmm1 \n\t" - "vxorps %%zmm2, %%zmm2, %%zmm2 \n\t" - "vxorps %%zmm3, %%zmm3, %%zmm3 \n\t" - "vxorps %%zmm4, %%zmm4, %%zmm4 \n\t" - "vxorps %%zmm5, %%zmm5, %%zmm5 \n\t" - "vxorps %%zmm6, %%zmm6, %%zmm6 \n\t" - "vxorps %%zmm7, %%zmm7, %%zmm7 \n\t" - "vxorps %%zmm8, %%zmm8, %%zmm8 \n\t" - "vxorps %%zmm9, %%zmm9, %%zmm9 \n\t" - "vxorps %%zmm10, %%zmm10, %%zmm10 \n\t" - "vxorps %%zmm11, %%zmm11, %%zmm11 \n\t" - - ".align 16 \n\t" - "1: \n\t" - "movq %2, %%rax \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 0x40(%%rax) \n\t" - "prefetcht0 0x80(%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "prefetcht0 0x40(%%rax, %4) \n\t" - "prefetcht0 0x80(%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 0x40(%%rax) \n\t" - "prefetcht0 0x80(%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "prefetcht0 0x40(%%rax, %4) \n\t" - "prefetcht0 0x80(%%rax, %4) \n\t" - - ".align 16 \n\t" - "2: \n\t" mmmKernel4x48 - - "add $0x180, %1 \n\t" - "add $0x8, %0 \n\t" - "dec %%rcx \n\t" - "jg 2b \n\t" - - "movq %2, %%rax \n\t" - "movq %4, %%rcx \n\t" - "addq %4, %%rcx \n\t" - "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" - "vpaddd 0x40(%%rax), %%zmm1, %%zmm1 \n\t" - "vpaddd 0x80(%%rax), %%zmm2, %%zmm2 \n\t" - "vpaddd (%%rax, %4), %%zmm3, %%zmm3 \n\t" - "vpaddd 0x40(%%rax, %4), %%zmm4, %%zmm4 \n\t" - "vpaddd 0x80(%%rax, %4), %%zmm5, %%zmm5 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%zmm6, %%zmm6 \n\t" - "vpaddd 0x40(%%rax), %%zmm7, %%zmm7 \n\t" - "vpaddd 0x80(%%rax), %%zmm8, %%zmm8 \n\t" - "vpaddd (%%rax, %4), %%zmm9, %%zmm9 \n\t" - "vpaddd 0x40(%%rax, %4), %%zmm10, %%zmm10 \n\t" - "vpaddd 0x80(%%rax, %4), %%zmm11, %%zmm11 \n\t" - - "cmpq $0x0, %5 \n\t" - "je 3f \n\t" - - "vbroadcastss (%5), %%zmm24 \n\t" - "vcvtdq2ps %%zmm0, %%zmm0 \n\t" - "vcvtdq2ps %%zmm1, %%zmm1 \n\t" - "vcvtdq2ps %%zmm2, %%zmm2 \n\t" - "vcvtdq2ps %%zmm3, %%zmm3 \n\t" - "vcvtdq2ps %%zmm4, %%zmm4 \n\t" - "vcvtdq2ps %%zmm5, %%zmm5 \n\t" - "vcvtdq2ps %%zmm6, %%zmm6 \n\t" - "vcvtdq2ps %%zmm7, %%zmm7 \n\t" - "vcvtdq2ps %%zmm8, %%zmm8 \n\t" - "vcvtdq2ps %%zmm9, %%zmm9 \n\t" - "vcvtdq2ps %%zmm10, %%zmm10 \n\t" - "vcvtdq2ps %%zmm11, %%zmm11 \n\t" - "vmulps %%zmm0, %%zmm24, %%zmm0 \n\t" - "vmulps %%zmm1, %%zmm24, %%zmm1 \n\t" - "vmulps %%zmm2, %%zmm24, %%zmm2 \n\t" - "vmulps %%zmm3, %%zmm24, %%zmm3 \n\t" - "vmulps %%zmm4, %%zmm24, %%zmm4 \n\t" - "vmulps %%zmm5, %%zmm24, %%zmm5 \n\t" - "vmulps %%zmm6, %%zmm24, %%zmm6 \n\t" - "vmulps %%zmm7, %%zmm24, %%zmm7 \n\t" - "vmulps %%zmm8, %%zmm24, %%zmm8 \n\t" - "vmulps %%zmm9, %%zmm24, %%zmm9 \n\t" - "vmulps %%zmm10, %%zmm24, 
%%zmm10 \n\t" - "vmulps %%zmm11, %%zmm24, %%zmm11 \n\t" - - "movq %%rbx, %%rax \n\t" - "andq $0x2, %%rax \n\t" - "je 3f \n\t" - "vcvtps2dq %%zmm0, %%zmm0 \n\t" - "vcvtps2dq %%zmm1, %%zmm1 \n\t" - "vcvtps2dq %%zmm2, %%zmm2 \n\t" - "vcvtps2dq %%zmm3, %%zmm3 \n\t" - "vcvtps2dq %%zmm4, %%zmm4 \n\t" - "vcvtps2dq %%zmm5, %%zmm5 \n\t" - "vcvtps2dq %%zmm6, %%zmm6 \n\t" - "vcvtps2dq %%zmm7, %%zmm7 \n\t" - "vcvtps2dq %%zmm8, %%zmm8 \n\t" - "vcvtps2dq %%zmm9, %%zmm9 \n\t" - "vcvtps2dq %%zmm10, %%zmm10 \n\t" - "vcvtps2dq %%zmm11, %%zmm11 \n\t" - "mov $128, %%eax \n\t" - "vmovd %%eax, %%xmm25 \n\t" - "vbroadcastss %%xmm25, %%zmm24 \n\t" - "vpaddd %%zmm0, %%zmm24, %%zmm0 \n\t" - "vpaddd %%zmm1, %%zmm24, %%zmm1 \n\t" - "vpaddd %%zmm2, %%zmm24, %%zmm2 \n\t" - "vpaddd %%zmm3, %%zmm24, %%zmm3 \n\t" - "vpaddd %%zmm4, %%zmm24, %%zmm4 \n\t" - "vpaddd %%zmm5, %%zmm24, %%zmm5 \n\t" - "vpaddd %%zmm6, %%zmm24, %%zmm6 \n\t" - "vpaddd %%zmm7, %%zmm24, %%zmm7 \n\t" - "vpaddd %%zmm8, %%zmm24, %%zmm8 \n\t" - "vpaddd %%zmm9, %%zmm24, %%zmm9 \n\t" - "vpaddd %%zmm10, %%zmm24, %%zmm10 \n\t" - "vpaddd %%zmm11, %%zmm24, %%zmm11 \n\t" - "movq %9, %%rax \n\t" - "shr $2, %4 \n\t" - "movq %4, %%rcx \n\t" - "addq %4, %%rcx \n\t" - "vpmovusdb %%zmm0, (%%rax) \n\t" - "vpmovusdb %%zmm1, 0x10(%%rax) \n\t" - "vpmovusdb %%zmm2, 0x20(%%rax) \n\t" - "vpmovusdb %%zmm3, (%%rax, %4) \n\t" - "vpmovusdb %%zmm4, 0x10(%%rax, %4) \n\t" - "vpmovusdb %%zmm5, 0x20(%%rax, %4) \n\t" - "add %%rcx, %%rax \n\t" - "vpmovusdb %%zmm6, (%%rax) \n\t" - "vpmovusdb %%zmm7, 0x10(%%rax) \n\t" - "vpmovusdb %%zmm8, 0x20(%%rax) \n\t" - "vpmovusdb %%zmm9, (%%rax, %4) \n\t" - "vpmovusdb %%zmm10, 0x10(%%rax, %4) \n\t" - "vpmovusdb %%zmm11, 0x20(%%rax, %4) \n\t" - "jmp 4f \n\t" - - ".align 16 \n\t" - "3: \n\t" - "movq %2, %%rax \n\t" - "vmovups %%zmm0, (%%rax) \n\t" - "vmovups %%zmm1, 0x40(%%rax) \n\t" - "vmovups %%zmm2, 0x80(%%rax) \n\t" - "vmovups %%zmm3, (%%rax, %4) \n\t" - "vmovups %%zmm4, 0x40(%%rax, %4) \n\t" - "vmovups %%zmm5, 0x80(%%rax, %4) \n\t" - "add %%rcx, %%rax \n\t" - "vmovups %%zmm6, (%%rax) \n\t" - "vmovups %%zmm7, 0x40(%%rax) \n\t" - "vmovups %%zmm8, 0x80(%%rax) \n\t" - "vmovups %%zmm9, (%%rax, %4) \n\t" - "vmovups %%zmm10, 0x40(%%rax, %4) \n\t" - "vmovups %%zmm11, 0x80(%%rax, %4) \n\t" - ".align 16 \n\t" - "4: \n\t" - : - : "r"(matrixA), "r"(matrixB), "r"(matrixC), "c"((int64_t)bk), "r"((long long)(N * 4)), - "r"(scale), "r"((int64_t)stepK), "r"(offsetC), "b"((int64_t)flags), "r"(u8Result) - : "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", - "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", - "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", "%zmm24", "%zmm25", "%zmm26", - "%zmm27", "%zmm28", "%zmm29", "%zmm30", "%zmm31", "memory", "cc"); -} - -#ifdef _USE_AVX512_VNNI -#define mmmKernel6x32 \ - "movq %0, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm29 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm30 \n\t" \ - "vpbroadcastd (%%rax, %%rbx), %%zmm31 \n\t" \ - "prefetcht0 0x80(%1) \n\t" \ - "prefetcht0 0xC0(%1) \n\t" \ - "vmovups (%1), %%zmm26 \n\t" \ - "vpdpbusd %%zmm24, %%zmm28, %%zmm0 \n\t" \ - "vpdpbusd %%zmm25, %%zmm28, %%zmm1 \n\t" \ - "vpdpbusd %%zmm24, %%zmm29, %%zmm2 \n\t" \ - "vpdpbusd %%zmm25, %%zmm29, %%zmm3 \n\t" \ - "vpdpbusd %%zmm24, %%zmm30, %%zmm4 \n\t" \ - "vpdpbusd %%zmm25, %%zmm30, %%zmm5 \n\t" \ - "vpdpbusd %%zmm24, %%zmm31, %%zmm6 \n\t" \ - "vpdpbusd %%zmm25, %%zmm31, %%zmm7 \n\t" \ - "addq %6, %%rax 
\n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm29 \n\t" \ - "vmovups 0x40(%1), %%zmm27 \n\t" \ - "vpdpbusd %%zmm24, %%zmm28, %%zmm8 \n\t" \ - "vpdpbusd %%zmm25, %%zmm28, %%zmm9 \n\t" \ - "vpdpbusd %%zmm24, %%zmm29, %%zmm10 \n\t" \ - "vpdpbusd %%zmm25, %%zmm29, %%zmm11 \n\t" \ - "movq %0, %%rax \n\t" \ - "addq $0x4, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm29 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm30 \n\t" \ - "vpbroadcastd (%%rax, %%rbx), %%zmm31 \n\t" \ - "prefetcht0 0x100(%1) \n\t" \ - "prefetcht0 0x140(%1) \n\t" \ - "vmovups 0x80(%1), %%zmm24 \n\t" \ - "vpdpbusd %%zmm26, %%zmm28, %%zmm0 \n\t" \ - "vpdpbusd %%zmm27, %%zmm28, %%zmm1 \n\t" \ - "vpdpbusd %%zmm26, %%zmm29, %%zmm2 \n\t" \ - "vpdpbusd %%zmm27, %%zmm29, %%zmm3 \n\t" \ - "vpdpbusd %%zmm26, %%zmm30, %%zmm4 \n\t" \ - "vpdpbusd %%zmm27, %%zmm30, %%zmm5 \n\t" \ - "vpdpbusd %%zmm26, %%zmm31, %%zmm6 \n\t" \ - "vpdpbusd %%zmm27, %%zmm31, %%zmm7 \n\t" \ - "addq %6, %%rax \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm29 \n\t" \ - "vmovups 0xC0(%1), %%zmm25 \n\t" \ - "vpdpbusd %%zmm26, %%zmm28, %%zmm8 \n\t" \ - "vpdpbusd %%zmm27, %%zmm28, %%zmm9 \n\t" \ - "vpdpbusd %%zmm26, %%zmm29, %%zmm10 \n\t" \ - "vpdpbusd %%zmm27, %%zmm29, %%zmm11 \n\t" -#else -#define mmmKernel6x32 \ - "movq %0, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm29 \n\t" \ - "prefetcht0 0x80(%1) \n\t" \ - "prefetcht0 0xC0(%1) \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm26 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm29, %%zmm28 \n\t" \ - "vpmaddwd %%zmm26, %%zmm31, %%zmm26 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpaddd %%zmm0, %%zmm26, %%zmm0 \n\t" \ - "vpaddd %%zmm1, %%zmm27, %%zmm1 \n\t" \ - "vpaddd %%zmm2, %%zmm28, %%zmm2 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm30 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm29, %%zmm26 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %%rbx), %%zmm29 \n\t" \ - "vpmaddwd %%zmm26, %%zmm31, %%zmm26 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "addq %6, %%rax \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "vpaddd %%zmm3, %%zmm26, %%zmm3 \n\t" \ - "vpaddd %%zmm4, %%zmm27, %%zmm4 \n\t" \ - "vpaddd %%zmm5, %%zmm28, %%zmm5 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm29, %%zmm26 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm29, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm28 \n\t" \ - "vpmaddwd %%zmm26, %%zmm31, %%zmm26 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm29 \n\t" \ - "vpaddd %%zmm6, %%zmm26, %%zmm6 \n\t" \ - "vpaddd %%zmm7, %%zmm27, %%zmm7 \n\t" \ - "vpaddd %%zmm8, %%zmm28, %%zmm8 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm26 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm29, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm29, %%zmm28 \n\t" \ - "movq %0, %%rax \n\t" \ - "addq $0x4, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "vpmaddwd %%zmm26, %%zmm31, %%zmm26 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vmovups (%1), %%zmm24 \n\t" \ - "vmovups 0x40(%1), %%zmm25 \n\t" \ - "vpaddd %%zmm9, %%zmm26, %%zmm9 \n\t" \ - 
"vpaddd %%zmm10, %%zmm27, %%zmm10 \n\t" \ - "vpaddd %%zmm11, %%zmm28, %%zmm11 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm29 \n\t" \ - "prefetcht0 0x100(%1) \n\t" \ - "prefetcht0 0x140(%1) \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm26 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm29, %%zmm28 \n\t" \ - "vpmaddwd %%zmm26, %%zmm31, %%zmm26 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpaddd %%zmm0, %%zmm26, %%zmm0 \n\t" \ - "vpaddd %%zmm1, %%zmm27, %%zmm1 \n\t" \ - "vpaddd %%zmm2, %%zmm28, %%zmm2 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm30 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm29, %%zmm26 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %%rbx), %%zmm29 \n\t" \ - "vpmaddwd %%zmm26, %%zmm31, %%zmm26 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "addq %6, %%rax \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "vpaddd %%zmm3, %%zmm26, %%zmm3 \n\t" \ - "vpaddd %%zmm4, %%zmm27, %%zmm4 \n\t" \ - "vpaddd %%zmm5, %%zmm28, %%zmm5 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm29, %%zmm26 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm29, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm28 \n\t" \ - "vpmaddwd %%zmm26, %%zmm31, %%zmm26 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm29 \n\t" \ - "vpaddd %%zmm6, %%zmm26, %%zmm6 \n\t" \ - "vpaddd %%zmm7, %%zmm27, %%zmm7 \n\t" \ - "vpaddd %%zmm8, %%zmm28, %%zmm8 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm26 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm29, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm29, %%zmm28 \n\t" \ - "vpmaddwd %%zmm26, %%zmm31, %%zmm26 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vmovups 0x80(%1), %%zmm24 \n\t" \ - "vmovups 0xC0(%1), %%zmm25 \n\t" \ - "vpaddd %%zmm9, %%zmm26, %%zmm9 \n\t" \ - "vpaddd %%zmm10, %%zmm27, %%zmm10 \n\t" \ - "vpaddd %%zmm11, %%zmm28, %%zmm11 \n\t" -#endif - -inline void mmm_avx512_6x32_asm(U32 um, - U32 un, - U32 bk, - UINT8 *matrixA, - INT8 *matrixB, - I32 *matrixC, - UINT8 *u8Result, - I32 *offsetC, - U32 N, - U32 stepK, - const F32 *scale, - U32 flags) -{ - __asm__ __volatile__( - "prefetcht0 0x80(%1) \n\t" - "prefetcht0 0xC0(%1) \n\t" - "vmovups (%1), %%zmm24 \n\t" - "vmovups 0x40(%1), %%zmm25 \n\t" - "add $0x80, %1 \n\t" -#ifndef _USE_AVX512_VNNI - "mov $1, %%ebx \n\t" - "vmovd %%ebx, %%xmm0 \n\t" - "vpbroadcastw %%xmm0, %%zmm31 \n\t" -#endif - "movq %8, %%rbx \n\t" - "andq $0x1, %%rbx \n\t" - "jne 0f \n\t" - "vmovups (%7), %%zmm0 \n\t" - "vmovups 0x40(%7), %%zmm1 \n\t" - "vmovups %%zmm0, %%zmm2 \n\t" - "vmovups %%zmm1, %%zmm3 \n\t" - "vmovups %%zmm0, %%zmm4 \n\t" - "vmovups %%zmm1, %%zmm5 \n\t" - "vmovups %%zmm0, %%zmm6 \n\t" - "vmovups %%zmm1, %%zmm7 \n\t" - "vmovups %%zmm0, %%zmm8 \n\t" - "vmovups %%zmm1, %%zmm9 \n\t" - "vmovups %%zmm0, %%zmm10 \n\t" - "vmovups %%zmm1, %%zmm11 \n\t" - "jmp 1f \n\t" - ".align 16 \n\t" - "0: \n\t" - "vxorps %%zmm0, %%zmm0, %%zmm0 \n\t" - "vxorps %%zmm1, %%zmm1, %%zmm1 \n\t" - "vxorps %%zmm2, %%zmm2, %%zmm2 \n\t" - "vxorps %%zmm3, %%zmm3, %%zmm3 \n\t" - "vxorps %%zmm4, %%zmm4, %%zmm4 \n\t" - "vxorps %%zmm5, %%zmm5, %%zmm5 \n\t" - "vxorps %%zmm6, %%zmm6, %%zmm6 \n\t" - "vxorps %%zmm7, %%zmm7, %%zmm7 \n\t" - "vxorps %%zmm8, %%zmm8, %%zmm8 \n\t" - "vxorps %%zmm9, %%zmm9, %%zmm9 \n\t" - "vxorps 
%%zmm10, %%zmm10, %%zmm10 \n\t" - "vxorps %%zmm11, %%zmm11, %%zmm11 \n\t" - ".align 16 \n\t" - "1: \n\t" - "movq %2, %%rax \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 0x40(%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "prefetcht0 0x40(%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 0x40(%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "prefetcht0 0x40(%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 0x40(%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "prefetcht0 0x40(%%rax, %4) \n\t" - "movq %6, %%rbx \n\t" - "addq %6, %%rbx \n\t" - "addq %6, %%rbx \n\t" - - ".align 16 \n\t" - "2: \n\t" mmmKernel6x32 - - "add $0x100, %1 \n\t" - "add $0x8, %0 \n\t" - "dec %%rcx \n\t" - "jg 2b \n\t" - - "movq %2, %%rax \n\t" - "movq %4, %%rcx \n\t" - "addq %4, %%rcx \n\t" - "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" - "vpaddd 0x40(%%rax), %%zmm1, %%zmm1 \n\t" - "vpaddd (%%rax, %4), %%zmm2, %%zmm2 \n\t" - "vpaddd 0x40(%%rax, %4), %%zmm3, %%zmm3 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%zmm4, %%zmm4 \n\t" - "vpaddd 0x40(%%rax), %%zmm5, %%zmm5 \n\t" - "vpaddd (%%rax, %4), %%zmm6, %%zmm6 \n\t" - "vpaddd 0x40(%%rax, %4), %%zmm7, %%zmm7 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%zmm8, %%zmm8 \n\t" - "vpaddd 0x40(%%rax), %%zmm9, %%zmm9 \n\t" - "vpaddd (%%rax, %4), %%zmm10, %%zmm10 \n\t" - "vpaddd 0x40(%%rax, %4), %%zmm11, %%zmm11 \n\t" - - "cmpq $0x0, %5 \n\t" - "je 3f \n\t" - - "vbroadcastss (%5), %%zmm24 \n\t" - "vcvtdq2ps %%zmm0, %%zmm0 \n\t" - "vcvtdq2ps %%zmm1, %%zmm1 \n\t" - "vcvtdq2ps %%zmm2, %%zmm2 \n\t" - "vcvtdq2ps %%zmm3, %%zmm3 \n\t" - "vcvtdq2ps %%zmm4, %%zmm4 \n\t" - "vcvtdq2ps %%zmm5, %%zmm5 \n\t" - "vcvtdq2ps %%zmm6, %%zmm6 \n\t" - "vcvtdq2ps %%zmm7, %%zmm7 \n\t" - "vcvtdq2ps %%zmm8, %%zmm8 \n\t" - "vcvtdq2ps %%zmm9, %%zmm9 \n\t" - "vcvtdq2ps %%zmm10, %%zmm10 \n\t" - "vcvtdq2ps %%zmm11, %%zmm11 \n\t" - "vmulps %%zmm0, %%zmm24, %%zmm0 \n\t" - "vmulps %%zmm1, %%zmm24, %%zmm1 \n\t" - "vmulps %%zmm2, %%zmm24, %%zmm2 \n\t" - "vmulps %%zmm3, %%zmm24, %%zmm3 \n\t" - "vmulps %%zmm4, %%zmm24, %%zmm4 \n\t" - "vmulps %%zmm5, %%zmm24, %%zmm5 \n\t" - "vmulps %%zmm6, %%zmm24, %%zmm6 \n\t" - "vmulps %%zmm7, %%zmm24, %%zmm7 \n\t" - "vmulps %%zmm8, %%zmm24, %%zmm8 \n\t" - "vmulps %%zmm9, %%zmm24, %%zmm9 \n\t" - "vmulps %%zmm10, %%zmm24, %%zmm10 \n\t" - "vmulps %%zmm11, %%zmm24, %%zmm11 \n\t" - - "movq %8, %%rbx \n\t" - "andq $0x2, %%rbx \n\t" - "je 3f \n\t" - "vcvtps2dq %%zmm0, %%zmm0 \n\t" - "vcvtps2dq %%zmm1, %%zmm1 \n\t" - "vcvtps2dq %%zmm2, %%zmm2 \n\t" - "vcvtps2dq %%zmm3, %%zmm3 \n\t" - "vcvtps2dq %%zmm4, %%zmm4 \n\t" - "vcvtps2dq %%zmm5, %%zmm5 \n\t" - "vcvtps2dq %%zmm6, %%zmm6 \n\t" - "vcvtps2dq %%zmm7, %%zmm7 \n\t" - "vcvtps2dq %%zmm8, %%zmm8 \n\t" - "vcvtps2dq %%zmm9, %%zmm9 \n\t" - "vcvtps2dq %%zmm10, %%zmm10 \n\t" - "vcvtps2dq %%zmm11, %%zmm11 \n\t" - "mov $128, %%eax \n\t" - "vmovd %%eax, %%xmm25 \n\t" - "vbroadcastss %%xmm25, %%zmm24 \n\t" - "vpaddd %%zmm0, %%zmm24, %%zmm0 \n\t" - "vpaddd %%zmm1, %%zmm24, %%zmm1 \n\t" - "vpaddd %%zmm2, %%zmm24, %%zmm2 \n\t" - "vpaddd %%zmm3, %%zmm24, %%zmm3 \n\t" - "vpaddd %%zmm4, %%zmm24, %%zmm4 \n\t" - "vpaddd %%zmm5, %%zmm24, %%zmm5 \n\t" - "vpaddd %%zmm6, %%zmm24, %%zmm6 \n\t" - "vpaddd %%zmm7, %%zmm24, %%zmm7 \n\t" - "vpaddd %%zmm8, %%zmm24, %%zmm8 \n\t" - "vpaddd %%zmm9, %%zmm24, %%zmm9 \n\t" - "vpaddd %%zmm10, %%zmm24, %%zmm10 \n\t" - "vpaddd %%zmm11, %%zmm24, %%zmm11 \n\t" - "movq %9, %%rax \n\t" - "shr $2, %4 \n\t" - "movq %4, %%rcx \n\t" - "addq %4, 
%%rcx \n\t" - "vpmovusdb %%zmm0, (%%rax) \n\t" - "vpmovusdb %%zmm1, 0x10(%%rax) \n\t" - "vpmovusdb %%zmm2, (%%rax, %4) \n\t" - "vpmovusdb %%zmm3, 0x10(%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm4, (%%rax) \n\t" - "vpmovusdb %%zmm5, 0x10(%%rax) \n\t" - "vpmovusdb %%zmm6, (%%rax, %4) \n\t" - "vpmovusdb %%zmm7, 0x10(%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm8, (%%rax) \n\t" - "vpmovusdb %%zmm9, 0x10(%%rax) \n\t" - "vpmovusdb %%zmm10, (%%rax, %4) \n\t" - "vpmovusdb %%zmm11, 0x10(%%rax, %4) \n\t" - "jmp 4f \n\t" - - ".align 16 \n\t" - "3: \n\t" - "movq %2, %%rax \n\t" - "vmovups %%zmm0, (%%rax) \n\t" - "vmovups %%zmm1, 0x40(%%rax) \n\t" - "vmovups %%zmm2, (%%rax, %4) \n\t" - "vmovups %%zmm3, 0x40(%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%zmm4, (%%rax) \n\t" - "vmovups %%zmm5, 0x40(%%rax) \n\t" - "vmovups %%zmm6, (%%rax, %4) \n\t" - "vmovups %%zmm7, 0x40(%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%zmm8, (%%rax) \n\t" - "vmovups %%zmm9, 0x40(%%rax) \n\t" - "vmovups %%zmm10, (%%rax, %4) \n\t" - "vmovups %%zmm11, 0x40(%%rax, %4) \n\t" - - ".align 16 \n\t" - "4: \n\t" - : - : "r"(matrixA), "r"(matrixB), "r"(matrixC), "c"((int64_t)bk), "r"((long long)(N * 4)), - "r"(scale), "r"((int64_t)stepK), "r"(offsetC), "r"((int64_t)flags), "r"(u8Result) - : "%rax", "%rbx", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", - "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", - "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", "%zmm24", "%zmm25", - "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", "%zmm31", "memory", "cc"); -} - -#ifdef _USE_AVX512_VNNI -#define mmmKernel12x16 \ - "movq %0, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm29 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm30 \n\t" \ - "prefetcht0 0x80(%1) \n\t" \ - "vpdpbusd %%zmm24, %%zmm25, %%zmm0 \n\t" \ - "vpdpbusd %%zmm24, %%zmm26, %%zmm1 \n\t" \ - "vpdpbusd %%zmm24, %%zmm27, %%zmm2 \n\t" \ - "vpdpbusd %%zmm24, %%zmm28, %%zmm3 \n\t" \ - "vpdpbusd %%zmm24, %%zmm29, %%zmm4 \n\t" \ - "vpdpbusd %%zmm24, %%zmm30, %%zmm5 \n\t" \ - "vmovups (%1), %%zmm31 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm29 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm30 \n\t" \ - "vpdpbusd %%zmm24, %%zmm25, %%zmm6 \n\t" \ - "vpdpbusd %%zmm24, %%zmm26, %%zmm7 \n\t" \ - "vpdpbusd %%zmm24, %%zmm27, %%zmm8 \n\t" \ - "vpdpbusd %%zmm24, %%zmm28, %%zmm9 \n\t" \ - "vpdpbusd %%zmm24, %%zmm29, %%zmm10 \n\t" \ - "vpdpbusd %%zmm24, %%zmm30, %%zmm11 \n\t" \ - "movq %0, %%rax \n\t" \ - "addq $0x4, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm29 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm30 \n\t" \ - "prefetcht0 0xC0(%1) \n\t" \ - "vpdpbusd %%zmm31, %%zmm25, %%zmm0 \n\t" \ - "vpdpbusd %%zmm31, %%zmm26, %%zmm1 \n\t" \ - "vpdpbusd %%zmm31, %%zmm27, %%zmm2 \n\t" \ - "vpdpbusd %%zmm31, %%zmm28, %%zmm3 \n\t" \ - 
"vpdpbusd %%zmm31, %%zmm29, %%zmm4 \n\t" \ - "vpdpbusd %%zmm31, %%zmm30, %%zmm5 \n\t" \ - "vmovups 0x40(%1), %%zmm24 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm29 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm30 \n\t" \ - "vpdpbusd %%zmm31, %%zmm25, %%zmm6 \n\t" \ - "vpdpbusd %%zmm31, %%zmm26, %%zmm7 \n\t" \ - "vpdpbusd %%zmm31, %%zmm27, %%zmm8 \n\t" \ - "vpdpbusd %%zmm31, %%zmm28, %%zmm9 \n\t" \ - "vpdpbusd %%zmm31, %%zmm29, %%zmm10 \n\t" \ - "vpdpbusd %%zmm31, %%zmm30, %%zmm11 \n\t" -#else -#define mmmKernel12x16 \ - "movq %0, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "prefetcht0 0x80(%1) \n\t" \ - "vpmaddubsw %%zmm24, %%zmm25, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm26, %%zmm29 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm27, %%zmm30 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpmaddwd %%zmm30, %%zmm31, %%zmm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "vpaddd %%zmm0, %%zmm28, %%zmm0 \n\t" \ - "vpaddd %%zmm1, %%zmm29, %%zmm1 \n\t" \ - "vpaddd %%zmm2, %%zmm30, %%zmm2 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm25, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm26, %%zmm29 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm27, %%zmm30 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpmaddwd %%zmm30, %%zmm31, %%zmm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "vpaddd %%zmm3, %%zmm28, %%zmm3 \n\t" \ - "vpaddd %%zmm4, %%zmm29, %%zmm4 \n\t" \ - "vpaddd %%zmm5, %%zmm30, %%zmm5 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm25, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm26, %%zmm29 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm27, %%zmm30 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpmaddwd %%zmm30, %%zmm31, %%zmm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "vpaddd %%zmm6, %%zmm28, %%zmm6 \n\t" \ - "vpaddd %%zmm7, %%zmm29, %%zmm7 \n\t" \ - "vpaddd %%zmm8, %%zmm30, %%zmm8 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm25, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm26, %%zmm29 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm27, %%zmm30 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpmaddwd %%zmm30, %%zmm31, %%zmm30 \n\t" \ - "movq %0, %%rax \n\t" \ - "addq $0x4, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "vpaddd %%zmm9, %%zmm28, %%zmm9 \n\t" \ - "vpaddd %%zmm10, %%zmm29, %%zmm10 \n\t" \ - "vpaddd %%zmm11, %%zmm30, %%zmm11 \n\t" \ - "vmovups (%1), %%zmm24 \n\t" \ - "prefetcht0 0xC0(%1) \n\t" \ - "vpmaddubsw %%zmm24, %%zmm25, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm26, %%zmm29 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm27, %%zmm30 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 
\n\t" \ - "vpmaddwd %%zmm30, %%zmm31, %%zmm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "vpaddd %%zmm0, %%zmm28, %%zmm0 \n\t" \ - "vpaddd %%zmm1, %%zmm29, %%zmm1 \n\t" \ - "vpaddd %%zmm2, %%zmm30, %%zmm2 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm25, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm26, %%zmm29 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm27, %%zmm30 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpmaddwd %%zmm30, %%zmm31, %%zmm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "vpaddd %%zmm3, %%zmm28, %%zmm3 \n\t" \ - "vpaddd %%zmm4, %%zmm29, %%zmm4 \n\t" \ - "vpaddd %%zmm5, %%zmm30, %%zmm5 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm25, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm26, %%zmm29 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm27, %%zmm30 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpmaddwd %%zmm30, %%zmm31, %%zmm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "vpaddd %%zmm6, %%zmm28, %%zmm6 \n\t" \ - "vpaddd %%zmm7, %%zmm29, %%zmm7 \n\t" \ - "vpaddd %%zmm8, %%zmm30, %%zmm8 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm25, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm26, %%zmm29 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm27, %%zmm30 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpmaddwd %%zmm30, %%zmm31, %%zmm30 \n\t" \ - "vmovups 0x40(%1), %%zmm24 \n\t" \ - "vpaddd %%zmm9, %%zmm28, %%zmm9 \n\t" \ - "vpaddd %%zmm10, %%zmm29, %%zmm10 \n\t" \ - "vpaddd %%zmm11, %%zmm30, %%zmm11 \n\t" -#endif - -inline void mmm_avx512_12x16_asm(U32 um, - U32 un, - U32 bk, - UINT8 *matrixA, - INT8 *matrixB, - I32 *matrixC, - UINT8 *u8Result, - I32 *offsetC, - U32 N, - U32 stepK, - const F32 *scale, - U32 flags) -{ - __asm__ __volatile__( - "prefetcht0 0x80(%1) \n\t" - "vmovups (%1), %%zmm24 \n\t" - "add $0x40, %1 \n\t" -#ifndef _USE_AVX512_VNNI - "mov $1, %%ebx \n\t" - "vmovd %%ebx, %%xmm0 \n\t" - "vpbroadcastw %%xmm0, %%zmm31 \n\t" -#endif - "movq %8, %%rbx \n\t" - "andq $0x1, %%rbx \n\t" - "jne 0f \n\t" - "vmovups (%7), %%zmm0 \n\t" - "vmovups %%zmm0, %%zmm1 \n\t" - "vmovups %%zmm0, %%zmm2 \n\t" - "vmovups %%zmm0, %%zmm3 \n\t" - "vmovups %%zmm0, %%zmm4 \n\t" - "vmovups %%zmm0, %%zmm5 \n\t" - "vmovups %%zmm0, %%zmm6 \n\t" - "vmovups %%zmm0, %%zmm7 \n\t" - "vmovups %%zmm0, %%zmm8 \n\t" - "vmovups %%zmm0, %%zmm9 \n\t" - "vmovups %%zmm0, %%zmm10 \n\t" - "vmovups %%zmm0, %%zmm11 \n\t" - "jmp 1f \n\t" - ".align 16 \n\t" - "0: \n\t" - "vxorps %%zmm0, %%zmm0, %%zmm0 \n\t" - "vxorps %%zmm1, %%zmm1, %%zmm1 \n\t" - "vxorps %%zmm2, %%zmm2, %%zmm2 \n\t" - "vxorps %%zmm3, %%zmm3, %%zmm3 \n\t" - "vxorps %%zmm4, %%zmm4, %%zmm4 \n\t" - "vxorps %%zmm5, %%zmm5, %%zmm5 \n\t" - "vxorps %%zmm6, %%zmm6, %%zmm6 \n\t" - "vxorps %%zmm7, %%zmm7, %%zmm7 \n\t" - "vxorps %%zmm8, %%zmm8, %%zmm8 \n\t" - "vxorps %%zmm9, %%zmm9, %%zmm9 \n\t" - "vxorps %%zmm10, %%zmm10, %%zmm10 \n\t" - "vxorps %%zmm11, %%zmm11, %%zmm11 \n\t" - ".align 16 \n\t" - "1: \n\t" - "movq %2, %%rax \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 
(%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "movq %6, %%rbx \n\t" - "addq %6, %%rbx \n\t" - "addq %6, %%rbx \n\t" - - ".align 16 \n\t" - "2: \n\t" mmmKernel12x16 - - "add $0x80, %1 \n\t" - "add $0x8, %0 \n\t" - "dec %%rcx \n\t" - "jg 2b \n\t" - - "movq %2, %%rax \n\t" - "movq %4, %%rcx \n\t" - "addq %4, %%rcx \n\t" - "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" - "vpaddd (%%rax, %4), %%zmm1, %%zmm1 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%zmm2, %%zmm2 \n\t" - "vpaddd (%%rax, %4), %%zmm3, %%zmm3 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%zmm4, %%zmm4 \n\t" - "vpaddd (%%rax, %4), %%zmm5, %%zmm5 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%zmm6, %%zmm6 \n\t" - "vpaddd (%%rax, %4), %%zmm7, %%zmm7 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%zmm8, %%zmm8 \n\t" - "vpaddd (%%rax, %4), %%zmm9, %%zmm9 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%zmm10, %%zmm10 \n\t" - "vpaddd (%%rax, %4), %%zmm11, %%zmm11 \n\t" - - "cmpq $0x0, %5 \n\t" - "je 3f \n\t" - - "vbroadcastss (%5), %%zmm24 \n\t" - "vcvtdq2ps %%zmm0, %%zmm0 \n\t" - "vcvtdq2ps %%zmm1, %%zmm1 \n\t" - "vcvtdq2ps %%zmm2, %%zmm2 \n\t" - "vcvtdq2ps %%zmm3, %%zmm3 \n\t" - "vcvtdq2ps %%zmm4, %%zmm4 \n\t" - "vcvtdq2ps %%zmm5, %%zmm5 \n\t" - "vcvtdq2ps %%zmm6, %%zmm6 \n\t" - "vcvtdq2ps %%zmm7, %%zmm7 \n\t" - "vcvtdq2ps %%zmm8, %%zmm8 \n\t" - "vcvtdq2ps %%zmm9, %%zmm9 \n\t" - "vcvtdq2ps %%zmm10, %%zmm10 \n\t" - "vcvtdq2ps %%zmm11, %%zmm11 \n\t" - "vmulps %%zmm0, %%zmm24, %%zmm0 \n\t" - "vmulps %%zmm1, %%zmm24, %%zmm1 \n\t" - "vmulps %%zmm2, %%zmm24, %%zmm2 \n\t" - "vmulps %%zmm3, %%zmm24, %%zmm3 \n\t" - "vmulps %%zmm4, %%zmm24, %%zmm4 \n\t" - "vmulps %%zmm5, %%zmm24, %%zmm5 \n\t" - "vmulps %%zmm6, %%zmm24, %%zmm6 \n\t" - "vmulps %%zmm7, %%zmm24, %%zmm7 \n\t" - "vmulps %%zmm8, %%zmm24, %%zmm8 \n\t" - "vmulps %%zmm9, %%zmm24, %%zmm9 \n\t" - "vmulps %%zmm10, %%zmm24, %%zmm10 \n\t" - "vmulps %%zmm11, %%zmm24, %%zmm11 \n\t" - - "movq %8, %%rbx \n\t" - "andq $0x2, %%rbx \n\t" - "je 3f \n\t" - "vcvtps2dq %%zmm0, %%zmm0 \n\t" - "vcvtps2dq %%zmm1, %%zmm1 \n\t" - "vcvtps2dq %%zmm2, %%zmm2 \n\t" - "vcvtps2dq %%zmm3, %%zmm3 \n\t" - "vcvtps2dq %%zmm4, %%zmm4 \n\t" - "vcvtps2dq %%zmm5, %%zmm5 \n\t" - "vcvtps2dq %%zmm6, %%zmm6 \n\t" - "vcvtps2dq %%zmm7, %%zmm7 \n\t" - "vcvtps2dq %%zmm8, %%zmm8 \n\t" - "vcvtps2dq %%zmm9, %%zmm9 \n\t" - "vcvtps2dq %%zmm10, %%zmm10 \n\t" - "vcvtps2dq %%zmm11, %%zmm11 \n\t" - "mov $128, %%eax \n\t" - "vmovd %%eax, %%xmm25 \n\t" - "vbroadcastss %%xmm25, %%zmm24 \n\t" - "vpaddd %%zmm0, %%zmm24, %%zmm0 \n\t" - "vpaddd %%zmm1, %%zmm24, %%zmm1 \n\t" - "vpaddd %%zmm2, %%zmm24, %%zmm2 \n\t" - "vpaddd %%zmm3, %%zmm24, %%zmm3 \n\t" - "vpaddd %%zmm4, %%zmm24, %%zmm4 \n\t" - "vpaddd %%zmm5, %%zmm24, %%zmm5 \n\t" - "vpaddd %%zmm6, %%zmm24, %%zmm6 \n\t" - "vpaddd %%zmm7, %%zmm24, %%zmm7 \n\t" - "vpaddd %%zmm8, %%zmm24, %%zmm8 \n\t" - "vpaddd %%zmm9, %%zmm24, %%zmm9 \n\t" - "vpaddd %%zmm10, %%zmm24, %%zmm10 \n\t" - "vpaddd %%zmm11, %%zmm24, %%zmm11 \n\t" - "movq %9, %%rax \n\t" - "shr $2, %4 \n\t" - "movq %4, %%rcx \n\t" - "addq %4, %%rcx \n\t" - "vpmovusdb %%zmm0, (%%rax) \n\t" - "vpmovusdb %%zmm1, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm2, (%%rax) \n\t" - "vpmovusdb %%zmm3, 
(%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm4, (%%rax) \n\t" - "vpmovusdb %%zmm5, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm6, (%%rax) \n\t" - "vpmovusdb %%zmm7, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm8, (%%rax) \n\t" - "vpmovusdb %%zmm9, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm10, (%%rax) \n\t" - "vpmovusdb %%zmm11, (%%rax, %4) \n\t" - "jmp 4f \n\t" - - ".align 16 \n\t" - "3: \n\t" - "movq %2, %%rax \n\t" - "vmovups %%zmm0, (%%rax) \n\t" - "vmovups %%zmm1, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%zmm2, (%%rax) \n\t" - "vmovups %%zmm3, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%zmm4, (%%rax) \n\t" - "vmovups %%zmm5, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%zmm6, (%%rax) \n\t" - "vmovups %%zmm7, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%zmm8, (%%rax) \n\t" - "vmovups %%zmm9, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%zmm10, (%%rax) \n\t" - "vmovups %%zmm11, (%%rax, %4) \n\t" - - ".align 16 \n\t" - "4: \n\t" - : - : "r"(matrixA), "r"(matrixB), "r"(matrixC), "c"((int64_t)bk), "r"((int64_t)(N * 4)), - "r"(scale), "r"((int64_t)stepK), "r"(offsetC), "r"((int64_t)flags), "r"(u8Result) - : "%rax", "%rbx", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", - "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", - "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", "%zmm24", "%zmm25", - "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", "%zmm31", "memory", "cc"); -} - -#ifdef _USE_AVX512_VNNI -#define mmmKernel12x8 \ - "movq %0, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm29 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm30 \n\t" \ - "prefetcht0 0x80(%1) \n\t" \ - "vpdpbusd %%ymm24, %%ymm25, %%ymm0 \n\t" \ - "vpdpbusd %%ymm24, %%ymm26, %%ymm1 \n\t" \ - "vpdpbusd %%ymm24, %%ymm27, %%ymm2 \n\t" \ - "vpdpbusd %%ymm24, %%ymm28, %%ymm3 \n\t" \ - "vpdpbusd %%ymm24, %%ymm29, %%ymm4 \n\t" \ - "vpdpbusd %%ymm24, %%ymm30, %%ymm5 \n\t" \ - "vmovups (%1), %%ymm31 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm29 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm30 \n\t" \ - "vpdpbusd %%ymm24, %%ymm25, %%ymm6 \n\t" \ - "vpdpbusd %%ymm24, %%ymm26, %%ymm7 \n\t" \ - "vpdpbusd %%ymm24, %%ymm27, %%ymm8 \n\t" \ - "vpdpbusd %%ymm24, %%ymm28, %%ymm9 \n\t" \ - "vpdpbusd %%ymm24, %%ymm29, %%ymm10 \n\t" \ - "vpdpbusd %%ymm24, %%ymm30, %%ymm11 \n\t" \ - "movq %0, %%rax \n\t" \ - "addq $0x4, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm29 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm30 \n\t" \ - "vpdpbusd %%ymm31, %%ymm25, %%ymm0 \n\t" \ - "vpdpbusd %%ymm31, %%ymm26, %%ymm1 \n\t" \ - "vpdpbusd %%ymm31, %%ymm27, %%ymm2 \n\t" \ - "vpdpbusd %%ymm31, %%ymm28, %%ymm3 \n\t" \ - "vpdpbusd %%ymm31, %%ymm29, %%ymm4 \n\t" \ - "vpdpbusd %%ymm31, %%ymm30, %%ymm5 \n\t" \ - 
"vmovups 0x20(%1), %%ymm24 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm29 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm30 \n\t" \ - "vpdpbusd %%ymm31, %%ymm25, %%ymm6 \n\t" \ - "vpdpbusd %%ymm31, %%ymm26, %%ymm7 \n\t" \ - "vpdpbusd %%ymm31, %%ymm27, %%ymm8 \n\t" \ - "vpdpbusd %%ymm31, %%ymm28, %%ymm9 \n\t" \ - "vpdpbusd %%ymm31, %%ymm29, %%ymm10 \n\t" \ - "vpdpbusd %%ymm31, %%ymm30, %%ymm11 \n\t" -#else -#define mmmKernel12x8 \ - "movq %0, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "prefetcht0 0x80(%1) \n\t" \ - "vpmaddubsw %%ymm24, %%ymm25, %%ymm28 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm26, %%ymm29 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm27, %%ymm30 \n\t" \ - "vpmaddwd %%ymm28, %%ymm31, %%ymm28 \n\t" \ - "vpmaddwd %%ymm29, %%ymm31, %%ymm29 \n\t" \ - "vpmaddwd %%ymm30, %%ymm31, %%ymm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "vpaddd %%ymm0, %%ymm28, %%ymm0 \n\t" \ - "vpaddd %%ymm1, %%ymm29, %%ymm1 \n\t" \ - "vpaddd %%ymm2, %%ymm30, %%ymm2 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm25, %%ymm28 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm26, %%ymm29 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm27, %%ymm30 \n\t" \ - "vpmaddwd %%ymm28, %%ymm31, %%ymm28 \n\t" \ - "vpmaddwd %%ymm29, %%ymm31, %%ymm29 \n\t" \ - "vpmaddwd %%ymm30, %%ymm31, %%ymm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "vpaddd %%ymm3, %%ymm28, %%ymm3 \n\t" \ - "vpaddd %%ymm4, %%ymm29, %%ymm4 \n\t" \ - "vpaddd %%ymm5, %%ymm30, %%ymm5 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm25, %%ymm28 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm26, %%ymm29 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm27, %%ymm30 \n\t" \ - "vpmaddwd %%ymm28, %%ymm31, %%ymm28 \n\t" \ - "vpmaddwd %%ymm29, %%ymm31, %%ymm29 \n\t" \ - "vpmaddwd %%ymm30, %%ymm31, %%ymm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "vpaddd %%ymm6, %%ymm28, %%ymm6 \n\t" \ - "vpaddd %%ymm7, %%ymm29, %%ymm7 \n\t" \ - "vpaddd %%ymm8, %%ymm30, %%ymm8 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm25, %%ymm28 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm26, %%ymm29 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm27, %%ymm30 \n\t" \ - "vpmaddwd %%ymm28, %%ymm31, %%ymm28 \n\t" \ - "vpmaddwd %%ymm29, %%ymm31, %%ymm29 \n\t" \ - "vpmaddwd %%ymm30, %%ymm31, %%ymm30 \n\t" \ - "movq %0, %%rax \n\t" \ - "addq $0x4, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "vpaddd %%ymm9, %%ymm28, %%ymm9 \n\t" \ - "vpaddd %%ymm10, %%ymm29, %%ymm10 \n\t" \ - "vpaddd %%ymm11, %%ymm30, %%ymm11 \n\t" \ - "vmovups (%1), %%ymm24 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm25, %%ymm28 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm26, %%ymm29 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm27, %%ymm30 \n\t" \ - "vpmaddwd %%ymm28, %%ymm31, %%ymm28 \n\t" \ - "vpmaddwd %%ymm29, %%ymm31, %%ymm29 \n\t" \ - "vpmaddwd %%ymm30, %%ymm31, %%ymm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - 
"vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "vpaddd %%ymm0, %%ymm28, %%ymm0 \n\t" \ - "vpaddd %%ymm1, %%ymm29, %%ymm1 \n\t" \ - "vpaddd %%ymm2, %%ymm30, %%ymm2 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm25, %%ymm28 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm26, %%ymm29 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm27, %%ymm30 \n\t" \ - "vpmaddwd %%ymm28, %%ymm31, %%ymm28 \n\t" \ - "vpmaddwd %%ymm29, %%ymm31, %%ymm29 \n\t" \ - "vpmaddwd %%ymm30, %%ymm31, %%ymm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "vpaddd %%ymm3, %%ymm28, %%ymm3 \n\t" \ - "vpaddd %%ymm4, %%ymm29, %%ymm4 \n\t" \ - "vpaddd %%ymm5, %%ymm30, %%ymm5 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm25, %%ymm28 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm26, %%ymm29 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm27, %%ymm30 \n\t" \ - "vpmaddwd %%ymm28, %%ymm31, %%ymm28 \n\t" \ - "vpmaddwd %%ymm29, %%ymm31, %%ymm29 \n\t" \ - "vpmaddwd %%ymm30, %%ymm31, %%ymm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "vpaddd %%ymm6, %%ymm28, %%ymm6 \n\t" \ - "vpaddd %%ymm7, %%ymm29, %%ymm7 \n\t" \ - "vpaddd %%ymm8, %%ymm30, %%ymm8 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm25, %%ymm28 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm26, %%ymm29 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm27, %%ymm30 \n\t" \ - "vpmaddwd %%ymm28, %%ymm31, %%ymm28 \n\t" \ - "vpmaddwd %%ymm29, %%ymm31, %%ymm29 \n\t" \ - "vpmaddwd %%ymm30, %%ymm31, %%ymm30 \n\t" \ - "vmovups 0x20(%1), %%ymm24 \n\t" \ - "vpaddd %%ymm9, %%ymm28, %%ymm9 \n\t" \ - "vpaddd %%ymm10, %%ymm29, %%ymm10 \n\t" \ - "vpaddd %%ymm11, %%ymm30, %%ymm11 \n\t" -#endif - -inline void mmm_avx512_12x8_asm(U32 um, - U32 un, - U32 bk, - UINT8 *matrixA, - INT8 *matrixB, - I32 *matrixC, - UINT8 *u8Result, - I32 *offsetC, - U32 N, - U32 stepK, - const F32 *scale, - U32 flags) -{ - __asm__ __volatile__( - "prefetcht0 0x40(%1) \n\t" - "vmovups (%1), %%ymm24 \n\t" - "add $0x20, %1 \n\t" -#ifndef _USE_AVX512_VNNI - "mov $1, %%ebx \n\t" - "vmovd %%ebx, %%xmm0 \n\t" - "vpbroadcastw %%xmm0, %%ymm31 \n\t" -#endif - "movq %8, %%rbx \n\t" - "andq $0x1, %%rbx \n\t" - "jne 0f \n\t" - "vmovups (%7), %%ymm0 \n\t" - "vmovups %%ymm0, %%ymm1 \n\t" - "vmovups %%ymm0, %%ymm2 \n\t" - "vmovups %%ymm0, %%ymm3 \n\t" - "vmovups %%ymm0, %%ymm4 \n\t" - "vmovups %%ymm0, %%ymm5 \n\t" - "vmovups %%ymm0, %%ymm6 \n\t" - "vmovups %%ymm0, %%ymm7 \n\t" - "vmovups %%ymm0, %%ymm8 \n\t" - "vmovups %%ymm0, %%ymm9 \n\t" - "vmovups %%ymm0, %%ymm10 \n\t" - "vmovups %%ymm0, %%ymm11 \n\t" - "jmp 1f \n\t" - ".align 16 \n\t" - "0: \n\t" - "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" - "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" - "vxorps %%ymm2, %%ymm2, %%ymm2 \n\t" - "vxorps %%ymm3, %%ymm3, %%ymm3 \n\t" - "vxorps %%ymm4, %%ymm4, %%ymm4 \n\t" - "vxorps %%ymm5, %%ymm5, %%ymm5 \n\t" - "vxorps %%ymm6, %%ymm6, %%ymm6 \n\t" - "vxorps %%ymm7, %%ymm7, %%ymm7 \n\t" - "vxorps %%ymm8, %%ymm8, %%ymm8 \n\t" - "vxorps %%ymm9, %%ymm9, %%ymm9 \n\t" - "vxorps %%ymm10, %%ymm10, %%ymm10 \n\t" - "vxorps %%ymm11, %%ymm11, %%ymm11 \n\t" - ".align 16 \n\t" - "1: \n\t" - "movq %2, %%rax \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "add %4, %%rax \n\t" - 
"prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "movq %6, %%rbx \n\t" - "addq %6, %%rbx \n\t" - "addq %6, %%rbx \n\t" - - ".align 16 \n\t" - "2: \n\t" mmmKernel12x8 - - "add $0x40, %1 \n\t" - "add $0x8, %0 \n\t" - "dec %%rcx \n\t" - "jg 2b \n\t" - - "movq %2, %%rax \n\t" - "movq %4, %%rcx \n\t" - "addq %4, %%rcx \n\t" - "vpaddd (%%rax), %%ymm0, %%ymm0 \n\t" - "vpaddd (%%rax, %4), %%ymm1, %%ymm1 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%ymm2, %%ymm2 \n\t" - "vpaddd (%%rax, %4), %%ymm3, %%ymm3 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%ymm4, %%ymm4 \n\t" - "vpaddd (%%rax, %4), %%ymm5, %%ymm5 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%ymm6, %%ymm6 \n\t" - "vpaddd (%%rax, %4), %%ymm7, %%ymm7 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%ymm8, %%ymm8 \n\t" - "vpaddd (%%rax, %4), %%ymm9, %%ymm9 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%ymm10, %%ymm10 \n\t" - "vpaddd (%%rax, %4), %%ymm11, %%ymm11 \n\t" - - "cmpq $0x0, %5 \n\t" - "je 3f \n\t" - - "vbroadcastss (%5), %%ymm24 \n\t" - "vcvtdq2ps %%ymm0, %%ymm0 \n\t" - "vcvtdq2ps %%ymm1, %%ymm1 \n\t" - "vcvtdq2ps %%ymm2, %%ymm2 \n\t" - "vcvtdq2ps %%ymm3, %%ymm3 \n\t" - "vcvtdq2ps %%ymm4, %%ymm4 \n\t" - "vcvtdq2ps %%ymm5, %%ymm5 \n\t" - "vcvtdq2ps %%ymm6, %%ymm6 \n\t" - "vcvtdq2ps %%ymm7, %%ymm7 \n\t" - "vcvtdq2ps %%ymm8, %%ymm8 \n\t" - "vcvtdq2ps %%ymm9, %%ymm9 \n\t" - "vcvtdq2ps %%ymm10, %%ymm10 \n\t" - "vcvtdq2ps %%ymm11, %%ymm11 \n\t" - "vmulps %%ymm0, %%ymm24, %%ymm0 \n\t" - "vmulps %%ymm1, %%ymm24, %%ymm1 \n\t" - "vmulps %%ymm2, %%ymm24, %%ymm2 \n\t" - "vmulps %%ymm3, %%ymm24, %%ymm3 \n\t" - "vmulps %%ymm4, %%ymm24, %%ymm4 \n\t" - "vmulps %%ymm5, %%ymm24, %%ymm5 \n\t" - "vmulps %%ymm6, %%ymm24, %%ymm6 \n\t" - "vmulps %%ymm7, %%ymm24, %%ymm7 \n\t" - "vmulps %%ymm8, %%ymm24, %%ymm8 \n\t" - "vmulps %%ymm9, %%ymm24, %%ymm9 \n\t" - "vmulps %%ymm10, %%ymm24, %%ymm10 \n\t" - "vmulps %%ymm11, %%ymm24, %%ymm11 \n\t" - - "movq %8, %%rbx \n\t" - "andq $0x2, %%rbx \n\t" - "je 3f \n\t" - "vcvtps2dq %%zmm0, %%zmm0 \n\t" - "vcvtps2dq %%zmm1, %%zmm1 \n\t" - "vcvtps2dq %%zmm2, %%zmm2 \n\t" - "vcvtps2dq %%zmm3, %%zmm3 \n\t" - "vcvtps2dq %%zmm4, %%zmm4 \n\t" - "vcvtps2dq %%zmm5, %%zmm5 \n\t" - "vcvtps2dq %%zmm6, %%zmm6 \n\t" - "vcvtps2dq %%zmm7, %%zmm7 \n\t" - "vcvtps2dq %%zmm8, %%zmm8 \n\t" - "vcvtps2dq %%zmm9, %%zmm9 \n\t" - "vcvtps2dq %%zmm10, %%zmm10 \n\t" - "vcvtps2dq %%zmm11, %%zmm11 \n\t" - "mov $128, %%eax \n\t" - "vmovd %%eax, %%xmm25 \n\t" - "vbroadcastss %%xmm25, %%zmm24 \n\t" - "vpaddd %%zmm0, %%zmm24, %%zmm0 \n\t" - "vpaddd %%zmm1, %%zmm24, %%zmm1 \n\t" - "vpaddd %%zmm2, %%zmm24, %%zmm2 \n\t" - "vpaddd %%zmm3, %%zmm24, %%zmm3 \n\t" - "vpaddd %%zmm4, %%zmm24, %%zmm4 \n\t" - "vpaddd %%zmm5, %%zmm24, %%zmm5 \n\t" - "vpaddd %%zmm6, %%zmm24, %%zmm6 \n\t" - "vpaddd %%zmm7, %%zmm24, %%zmm7 \n\t" - "vpaddd %%zmm8, %%zmm24, %%zmm8 \n\t" - "vpaddd %%zmm9, %%zmm24, %%zmm9 \n\t" - "vpaddd %%zmm10, %%zmm24, %%zmm10 \n\t" - "vpaddd %%zmm11, %%zmm24, %%zmm11 \n\t" - "movq %9, %%rax \n\t" - "shr $2, %4 \n\t" - "movq %4, %%rcx \n\t" - "addq %4, %%rcx \n\t" - "vpmovusdb %%zmm0, (%%rax) \n\t" - "vpmovusdb %%zmm1, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm2, (%%rax) \n\t" - "vpmovusdb %%zmm3, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm4, (%%rax) \n\t" - "vpmovusdb %%zmm5, (%%rax, %4) \n\t" - "addq %%rcx, 
%%rax \n\t" - "vpmovusdb %%zmm6, (%%rax) \n\t" - "vpmovusdb %%zmm7, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm8, (%%rax) \n\t" - "vpmovusdb %%zmm9, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm10, (%%rax) \n\t" - "vpmovusdb %%zmm11, (%%rax, %4) \n\t" - "jmp 4f \n\t" - - ".align 16 \n\t" - "3: \n\t" - "movq %2, %%rax \n\t" - "vmovups %%ymm0, (%%rax) \n\t" - "vmovups %%ymm1, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%ymm2, (%%rax) \n\t" - "vmovups %%ymm3, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%ymm4, (%%rax) \n\t" - "vmovups %%ymm5, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%ymm6, (%%rax) \n\t" - "vmovups %%ymm7, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%ymm8, (%%rax) \n\t" - "vmovups %%ymm9, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%ymm10, (%%rax) \n\t" - "vmovups %%ymm11, (%%rax, %4) \n\t" - - ".align 16 \n\t" - "4: \n\t" - : - : "r"(matrixA), "r"(matrixB), "r"(matrixC), "c"((int64_t)bk), "r"((long long)(N * 4)), - "r"(scale), "r"((int64_t)stepK), "r"(offsetC), "r"((int64_t)flags), "r"(u8Result) - : "%rax", "%rbx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", - "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "%ymm16", - "%ymm17", "%ymm18", "%ymm19", "%ymm20", "%ymm21", "%ymm22", "%ymm23", "%ymm24", "%ymm25", - "%ymm26", "%ymm27", "%ymm28", "%ymm29", "%ymm30", "%ymm31", "memory", "cc"); -} - -#ifdef _USE_AVX512_VNNI -#define mmmKernel1x48 \ - "movq %0, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "vpbroadcastd 0x4(%%rax), %%zmm31 \n\t" \ - "prefetcht0 0xC0(%1) \n\t" \ - "prefetcht0 0x100(%1) \n\t" \ - "prefetcht0 0x140(%1) \n\t" \ - "vmovups (%1), %%zmm27 \n\t" \ - "vmovups 0x40(%1), %%zmm28 \n\t" \ - "vmovups 0x80(%1), %%zmm29 \n\t" \ - "vpdpbusd %%zmm24, %%zmm30, %%zmm0 \n\t" \ - "vpdpbusd %%zmm25, %%zmm30, %%zmm1 \n\t" \ - "vpdpbusd %%zmm26, %%zmm30, %%zmm2 \n\t" \ - "prefetcht0 0x180(%1) \n\t" \ - "prefetcht0 0x1C0(%1) \n\t" \ - "prefetcht0 0x200(%1) \n\t" \ - "vmovups 0xC0(%1), %%zmm24 \n\t" \ - "vmovups 0x100(%1), %%zmm25 \n\t" \ - "vmovups 0x140(%1), %%zmm26 \n\t" \ - "vpdpbusd %%zmm27, %%zmm31, %%zmm0 \n\t" \ - "vpdpbusd %%zmm28, %%zmm31, %%zmm1 \n\t" \ - "vpdpbusd %%zmm29, %%zmm31, %%zmm2 \n\t" -#else -#define mmmKernel1x48 \ - "movq %0, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "prefetcht0 0xC0(%1) \n\t" \ - "prefetcht0 0x100(%1) \n\t" \ - "prefetcht0 0x140(%1) \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm26, %%zmm30, %%zmm29 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vmovups (%1), %%zmm24 \n\t" \ - "vmovups 0x40(%1), %%zmm25 \n\t" \ - "vmovups 0x80(%1), %%zmm26 \n\t" \ - "vpbroadcastd 0x4(%%rax), %%zmm30 \n\t" \ - "vpaddd %%zmm0, %%zmm27, %%zmm0 \n\t" \ - "vpaddd %%zmm1, %%zmm28, %%zmm1 \n\t" \ - "vpaddd %%zmm2, %%zmm29, %%zmm2 \n\t" \ - "prefetcht0 0x180(%1) \n\t" \ - "prefetcht0 0x1C0(%1) \n\t" \ - "prefetcht0 0x200(%1) \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm26, %%zmm30, %%zmm29 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vmovups 0xC0(%1), %%zmm24 \n\t" \ - "vmovups 0x100(%1), %%zmm25 \n\t" \ - "vmovups 
0x140(%1), %%zmm26 \n\t" \ - "vpaddd %%zmm0, %%zmm27, %%zmm0 \n\t" \ - "vpaddd %%zmm1, %%zmm28, %%zmm1 \n\t" \ - "vpaddd %%zmm2, %%zmm29, %%zmm2 \n\t" -#endif - -inline void mmm_avx512_1x48_asm(U32 um, - U32 un, - U32 bk, - UINT8 *matrixA, - INT8 *matrixB, - I32 *matrixC, - UINT8 *u8Result, - I32 *offsetC, - U32 N, - U32 stepK, - const F32 *scale, - U32 flags) -{ - __asm__ __volatile__( - "prefetcht0 0xC0(%1) \n\t" - "prefetcht0 0x100(%1) \n\t" - "prefetcht0 0x140(%1) \n\t" - "vmovups (%1), %%zmm24 \n\t" - "vmovups 0x40(%1), %%zmm25 \n\t" - "vmovups 0x80(%1), %%zmm26 \n\t" - "add $0xC0, %1 \n\t" -#ifndef _USE_AVX512_VNNI - "mov $1, %%eax \n\t" - "vmovd %%eax, %%xmm0 \n\t" - "vpbroadcastw %%xmm0, %%zmm31 \n\t" -#endif - "movq %%rbx, %%rax \n\t" - "andq $0x1, %%rax \n\t" - "jne 0f \n\t" - "vmovups (%6), %%zmm0 \n\t" - "vmovups 0x40(%6), %%zmm1 \n\t" - "vmovups 0x80(%6), %%zmm2 \n\t" - "jmp 1f \n\t" - - ".align 16 \n\t" - "0: \n\t" - "vxorps %%zmm0, %%zmm0, %%zmm0 \n\t" - "vxorps %%zmm1, %%zmm1, %%zmm1 \n\t" - "vxorps %%zmm2, %%zmm2, %%zmm2 \n\t" - - ".align 16 \n\t" - "1: \n\t" - "movq %2, %%rax \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 0x40(%%rax) \n\t" - "prefetcht0 0x80(%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "prefetcht0 0x40(%%rax, %4) \n\t" - "prefetcht0 0x80(%%rax, %4) \n\t" - - ".align 16 \n\t" - "2: \n\t" mmmKernel1x48 - - "add $0x180, %1 \n\t" - "add $0x8, %0 \n\t" - "dec %%rcx \n\t" - "jg 2b \n\t" - - "vpaddd (%2), %%zmm0, %%zmm0 \n\t" - "vpaddd 0x40(%2), %%zmm1, %%zmm1 \n\t" - "vpaddd 0x80(%2), %%zmm2, %%zmm2 \n\t" - - "cmpq $0x0, %5 \n\t" - "je 3f \n\t" - - "vbroadcastss (%5), %%zmm24 \n\t" - "vcvtdq2ps %%zmm0, %%zmm0 \n\t" - "vcvtdq2ps %%zmm1, %%zmm1 \n\t" - "vcvtdq2ps %%zmm2, %%zmm2 \n\t" - "vmulps %%zmm0, %%zmm24, %%zmm0 \n\t" - "vmulps %%zmm1, %%zmm24, %%zmm1 \n\t" - "vmulps %%zmm2, %%zmm24, %%zmm2 \n\t" - - "movq %%rbx, %%rax \n\t" - "andq $0x2, %%rax \n\t" - "je 3f \n\t" - "vcvtps2dq %%zmm0, %%zmm0 \n\t" - "vcvtps2dq %%zmm1, %%zmm1 \n\t" - "vcvtps2dq %%zmm2, %%zmm2 \n\t" - "mov $128, %%eax \n\t" - "vmovd %%eax, %%xmm25 \n\t" - "vbroadcastss %%xmm25, %%zmm24 \n\t" - "vpaddd %%zmm0, %%zmm24, %%zmm0 \n\t" - "vpaddd %%zmm1, %%zmm24, %%zmm1 \n\t" - "vpaddd %%zmm2, %%zmm24, %%zmm2 \n\t" - "vpmovusdb %%zmm0, (%8) \n\t" - "vpmovusdb %%zmm1, 0x10(%8) \n\t" - "vpmovusdb %%zmm2, 0x20(%8) \n\t" - "jmp 4f \n\t" - - ".align 16 \n\t" - "3: \n\t" - "vmovups %%zmm0, (%2) \n\t" - "vmovups %%zmm1, 0x40(%2) \n\t" - "vmovups %%zmm2, 0x80(%2) \n\t" - ".align 16 \n\t" - "4: \n\t" - : - : "r"(matrixA), "r"(matrixB), "r"(matrixC), "c"((int64_t)bk), "r"((long long)(N * 4)), - "r"(scale), "r"(offsetC), "b"((int64_t)flags), "r"(u8Result) - : "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", - "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", - "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", "%zmm24", "%zmm25", "%zmm26", - "%zmm27", "%zmm28", "%zmm29", "%zmm30", "%zmm31", "memory", "cc"); -} - -#ifdef _USE_AVX512_VNNI -#define mmmKernel1x32 \ - "vpbroadcastd (%0), %%zmm28 \n\t" \ - "vpbroadcastd 0x4(%0), %%zmm29 \n\t" \ - "prefetcht0 0x80(%1) \n\t" \ - "prefetcht0 0xC0(%1) \n\t" \ - "vmovups (%1), %%zmm26 \n\t" \ - "vmovups 0x40(%1), %%zmm27 \n\t" \ - "vpdpbusd %%zmm24, %%zmm28, %%zmm0 \n\t" \ - "vpdpbusd %%zmm25, %%zmm28, %%zmm1 \n\t" \ - "prefetcht0 0x100(%1) \n\t" \ - "prefetcht0 0x140(%1) \n\t" \ - "vmovups 0x80(%1), %%zmm24 \n\t" \ - "vmovups 0xC0(%1), %%zmm25 \n\t" \ - 
"vpdpbusd %%zmm26, %%zmm29, %%zmm0 \n\t" \ - "vpdpbusd %%zmm27, %%zmm29, %%zmm1 \n\t" -#else -#define mmmKernel1x32 \ - "vpbroadcastd (%0), %%zmm30 \n\t" \ - "prefetcht0 0x80(%1) \n\t" \ - "prefetcht0 0xC0(%1) \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm26 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm27 \n\t" \ - "vpmaddwd %%zmm26, %%zmm31, %%zmm26 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpbroadcastd 0x4(%0), %%zmm30 \n\t" \ - "vmovups (%1), %%zmm24 \n\t" \ - "vmovups 0x40(%1), %%zmm25 \n\t" \ - "vpaddd %%zmm0, %%zmm26, %%zmm0 \n\t" \ - "vpaddd %%zmm1, %%zmm27, %%zmm1 \n\t" \ - "prefetcht0 0x100(%1) \n\t" \ - "prefetcht0 0x140(%1) \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm26 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm27 \n\t" \ - "vpmaddwd %%zmm26, %%zmm31, %%zmm26 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vmovups 0x80(%1), %%zmm24 \n\t" \ - "vmovups 0xC0(%1), %%zmm25 \n\t" \ - "vpaddd %%zmm0, %%zmm26, %%zmm0 \n\t" \ - "vpaddd %%zmm1, %%zmm27, %%zmm1 \n\t" -#endif - -inline void mmm_avx512_1x32_asm(U32 um, - U32 un, - U32 bk, - UINT8 *matrixA, - INT8 *matrixB, - I32 *matrixC, - UINT8 *u8Result, - I32 *offsetC, - U32 N, - U32 stepK, - const F32 *scale, - U32 flags) -{ - __asm__ __volatile__( - "prefetcht0 0x80(%1) \n\t" - "prefetcht0 0xC0(%1) \n\t" - "vmovups (%1), %%zmm24 \n\t" - "vmovups 0x40(%1), %%zmm25 \n\t" - "add $0x80, %1 \n\t" -#ifndef _USE_AVX512_VNNI - "mov $1, %%eax \n\t" - "vmovd %%eax, %%xmm0 \n\t" - "vpbroadcastw %%xmm0, %%zmm31 \n\t" -#endif - "movq %%rbx, %%rax \n\t" - "andq $0x1, %%rax \n\t" - "jne 0f \n\t" - "vmovups (%6), %%zmm0 \n\t" - "vmovups 0x40(%6), %%zmm1 \n\t" - "jmp 1f \n\t" - - ".align 16 \n\t" - "0: \n\t" - "vxorps %%zmm0, %%zmm0, %%zmm0 \n\t" - "vxorps %%zmm1, %%zmm1, %%zmm1 \n\t" - - ".align 16 \n\t" - "1: \n\t" mmmKernel1x32 - - "add $0x100, %1 \n\t" - "add $0x8, %0 \n\t" - "dec %%rcx \n\t" - "jg 1b \n\t" - - "vpaddd (%2), %%zmm0, %%zmm0 \n\t" - "vpaddd 0x40(%2), %%zmm1, %%zmm1 \n\t" - - "cmpq $0x0, %5 \n\t" - "je 2f \n\t" - - "vbroadcastss (%5), %%zmm24 \n\t" - "vcvtdq2ps %%zmm0, %%zmm0 \n\t" - "vcvtdq2ps %%zmm1, %%zmm1 \n\t" - "vmulps %%zmm0, %%zmm24, %%zmm0 \n\t" - "vmulps %%zmm1, %%zmm24, %%zmm1 \n\t" - - "movq %%rbx, %%rax \n\t" - "andq $0x2, %%rax \n\t" - "je 2f \n\t" - "vcvtps2dq %%zmm0, %%zmm0 \n\t" - "vcvtps2dq %%zmm1, %%zmm1 \n\t" - "mov $128, %%eax \n\t" - "vmovd %%eax, %%xmm25 \n\t" - "vbroadcastss %%xmm25, %%zmm24 \n\t" - "vpaddd %%zmm0, %%zmm24, %%zmm0 \n\t" - "vpaddd %%zmm1, %%zmm24, %%zmm1 \n\t" - "vpmovusdb %%zmm0, (%8) \n\t" - "vpmovusdb %%zmm1, 0x10(%8) \n\t" - "jmp 3f \n\t" - - ".align 16 \n\t" - "2: \n\t" - "vmovups %%zmm0, (%2) \n\t" - "vmovups %%zmm1, 0x40(%2) \n\t" - - ".align 16 \n\t" - "3: \n\t" - : - : "r"(matrixA), "r"(matrixB), "r"(matrixC), "c"((int64_t)bk), "r"((long long)(N * 4)), - "r"(scale), "r"(offsetC), "b"((int64_t)flags), "r"(u8Result) - : "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", - "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", - "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", "%zmm24", "%zmm25", "%zmm26", - "%zmm27", "%zmm28", "%zmm29", "%zmm30", "%zmm31", "memory", "cc"); -} - -#ifdef _USE_AVX512_VNNI -#define mmmKernel1x16 \ - "vpbroadcastd (%0), %%zmm25 \n\t" \ - "vpbroadcastd 0x4(%0), %%zmm26 \n\t" \ - "prefetcht0 0x80(%1) \n\t" \ - "vmovups (%1), %%zmm31 \n\t" \ - "vpdpbusd %%zmm24, %%zmm25, %%zmm0 \n\t" \ - "prefetcht0 0xC0(%1) \n\t" \ - "vmovups 0x40(%1), %%zmm24 \n\t" \ - 
"vpdpbusd %%zmm31, %%zmm26, %%zmm0 \n\t" -#else -#define mmmKernel1x16 \ - "vpbroadcastd (%0), %%zmm25 \n\t" \ - "vpbroadcastd 0x4(%0), %%zmm26 \n\t" \ - "prefetcht0 0x80(%1) \n\t" \ - "vmovups (%1), %%zmm30 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm25, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm30, %%zmm26, %%zmm29 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "prefetcht0 0xC0(%1) \n\t" \ - "vmovups 0x40(%1), %%zmm24 \n\t" \ - "vpaddd %%zmm0, %%zmm28, %%zmm0 \n\t" \ - "vpaddd %%zmm0, %%zmm29, %%zmm0 \n\t" -#endif - -inline void mmm_avx512_1x16_asm(U32 um, - U32 un, - U32 bk, - UINT8 *matrixA, - INT8 *matrixB, - I32 *matrixC, - UINT8 *u8Result, - I32 *offsetC, - U32 N, - U32 stepK, - const F32 *scale, - U32 flags) -{ - __asm__ __volatile__( - "prefetcht0 0x80(%1) \n\t" - "vmovups (%1), %%zmm24 \n\t" - "add $0x40, %1 \n\t" -#ifndef _USE_AVX512_VNNI - "mov $1, %%eax \n\t" - "vmovd %%eax, %%xmm0 \n\t" - "vpbroadcastw %%xmm0, %%zmm31 \n\t" -#endif - "movq %%rbx, %%rax \n\t" - "andq $0x1, %%rax \n\t" - "jne 0f \n\t" - "vmovups (%6), %%zmm0 \n\t" - "jmp 1f \n\t" - - ".align 16 \n\t" - "0: \n\t" - "vxorps %%zmm0, %%zmm0, %%zmm0 \n\t" - - ".align 16 \n\t" - "1: \n\t" mmmKernel1x16 - - "add $0x80, %1 \n\t" - "add $0x8, %0 \n\t" - "dec %%rcx \n\t" - "jg 1b \n\t" - - "vpaddd (%2), %%zmm0, %%zmm0 \n\t" - - "cmpq $0x0, %5 \n\t" - "je 2f \n\t" - - "vbroadcastss (%5), %%zmm24 \n\t" - "vcvtdq2ps %%zmm0, %%zmm0 \n\t" - "vmulps %%zmm0, %%zmm24, %%zmm0 \n\t" - - "movq %%rbx, %%rax \n\t" - "andq $0x2, %%rax \n\t" - "je 2f \n\t" - "vcvtps2dq %%zmm0, %%zmm0 \n\t" - "mov $128, %%eax \n\t" - "vmovd %%eax, %%xmm25 \n\t" - "vbroadcastss %%xmm25, %%zmm24 \n\t" - "vpaddd %%zmm0, %%zmm24, %%zmm0 \n\t" - "vpmovusdb %%zmm0, (%8) \n\t" - "jmp 3f \n\t" - - ".align 16 \n\t" - "2: \n\t" - "vmovups %%zmm0, (%2) \n\t" - - ".align 16 \n\t" - "3: \n\t" - : - : "r"(matrixA), "r"(matrixB), "r"(matrixC), "c"((int64_t)bk), "r"((long long)(N * 4)), - "r"(scale), "r"(offsetC), "b"((int64_t)flags), "r"(u8Result) - : "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", - "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", - "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", "%zmm24", "%zmm25", "%zmm26", - "%zmm27", "%zmm28", "%zmm29", "%zmm30", "%zmm31", "memory", "cc"); -} - -#ifdef _USE_AVX512_VNNI -#define mmmKernel1x8 \ - "vpbroadcastd (%0), %%ymm25 \n\t" \ - "vpbroadcastd 0x4(%0), %%ymm26 \n\t" \ - "prefetcht0 0x40(%1) \n\t" \ - "vmovups (%1), %%ymm31 \n\t" \ - "vpdpbusd %%ymm24, %%ymm25, %%ymm0 \n\t" \ - "vmovups 0x20(%1), %%ymm24 \n\t" \ - "vpdpbusd %%ymm31, %%ymm26, %%ymm0 \n\t" -#else -#define mmmKernel1x8 \ - "vpbroadcastd (%0), %%ymm25 \n\t" \ - "vpbroadcastd 0x4(%0), %%ymm26 \n\t" \ - "prefetcht0 0x80(%1) \n\t" \ - "vmovups (%1), %%ymm30 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm25, %%ymm28 \n\t" \ - "vpmaddubsw %%ymm30, %%ymm26, %%ymm29 \n\t" \ - "vpmaddwd %%ymm28, %%ymm31, %%ymm28 \n\t" \ - "vpmaddwd %%ymm29, %%ymm31, %%ymm29 \n\t" \ - "vmovups 0x20(%1), %%ymm24 \n\t" \ - "vpaddd %%ymm0, %%ymm28, %%ymm0 \n\t" \ - "vpaddd %%ymm0, %%ymm29, %%ymm0 \n\t" -#endif - -inline void mmm_avx512_1x8_asm(U32 um, - U32 un, - U32 bk, - UINT8 *matrixA, - INT8 *matrixB, - I32 *matrixC, - UINT8 *u8Result, - I32 *offsetC, - U32 N, - U32 stepK, - const F32 *scale, - U32 flags) -{ - __asm__ __volatile__( - "prefetcht0 0x40(%1) \n\t" - "vmovups (%1), %%ymm24 \n\t" - "add $0x20, %1 \n\t" -#ifndef _USE_AVX512_VNNI - "mov $1, 
%%eax \n\t" - "vmovd %%eax, %%xmm0 \n\t" - "vpbroadcastw %%xmm0, %%ymm31 \n\t" -#endif - "movq %%rbx, %%rax \n\t" - "andq $0x1, %%rax \n\t" - "jne 0f \n\t" - "vmovups (%6), %%ymm0 \n\t" - "jmp 1f \n\t" - - ".align 16 \n\t" - "0: \n\t" - "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" - - ".align 16 \n\t" - "1: \n\t" mmmKernel1x8 - - "add $0x40, %1 \n\t" - "add $0x8, %0 \n\t" - "dec %%rcx \n\t" - "jg 1b \n\t" - - "vpaddd (%2), %%ymm0, %%ymm0 \n\t" - - "cmpq $0x0, %5 \n\t" - "je 2f \n\t" - - "vbroadcastss (%5), %%ymm24 \n\t" - "vcvtdq2ps %%ymm0, %%ymm0 \n\t" - "vmulps %%ymm0, %%ymm24, %%ymm0 \n\t" - "movq %%rbx, %%rax \n\t" - "andq $0x2, %%rax \n\t" - "je 2f \n\t" - "vcvtps2dq %%ymm0, %%ymm0 \n\t" - "mov $128, %%eax \n\t" - "vmovd %%eax, %%xmm25 \n\t" - "vbroadcastss %%xmm25, %%ymm24 \n\t" - "vpaddd %%ymm0, %%ymm24, %%ymm0 \n\t" - "vpmovusdb %%ymm0, (%8) \n\t" - "jmp 3f \n\t" - - ".align 16 \n\t" - "2: \n\t" - "vmovups %%ymm0, (%2) \n\t" - - ".align 16 \n\t" - "3: \n\t" - : - : "r"(matrixA), "r"(matrixB), "r"(matrixC), "c"((int64_t)bk), "r"((long long)(N * 4)), - "r"(scale), "r"(offsetC), "b"((int64_t)flags), "r"(u8Result) - : "%rax", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", - "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "%ymm16", "%ymm17", - "%ymm18", "%ymm19", "%ymm20", "%ymm21", "%ymm22", "%ymm23", "%ymm24", "%ymm25", "%ymm26", - "%ymm27", "%ymm28", "%ymm29", "%ymm30", "%ymm31", "memory", "cc"); -} - -void mmm_avx512_n_mtail(U32 um, - U32 un, - U32 bk, - UINT8 *matrixA, - INT8 *matrixB, - I32 *matrixC, - UINT8 *u8Result, - I32 *offsetC, - U32 N, - U32 stepK, - const F32 *scale, - U32 flags) -{ - I32 *result = (I32 *)matrixC; - F32 *resultF32 = (F32 *)matrixC; - for (U32 i = 0; i < um; ++i) { - for (U32 j = 0; j < un; ++j) { - I32 tmp = result[i * N + j]; - for (U32 k = 0; k < bk * 8; k += 4) { - if (((flags & 0x1) == 0) && (k == 0)) { - tmp += offsetC[j]; - } - for (U32 k4 = 0; k4 < 4; ++k4) { - tmp += (int)matrixA[i * stepK + k4 + k] * (int)matrixB[k * un + j * 4 + k4]; - } - } - if (scale != nullptr) { - resultF32[i * N + j] = tmp * scale[0]; - if ((flags & 0x2) != 0) { - tmp = (I32)(resultF32[i * N + j] + 128); - u8Result[i * N + j] = (tmp > 255) ? 
255 : tmp; - } - } else { - result[i * N + j] = tmp; - } - } - } -} - //TODO: matrixC alloc EE mmm_avx512_vnni_int8(U32 N, U32 M, @@ -4836,38 +2112,50 @@ EE mmm_avx512_vnni_int8(U32 N, const F32 *scale) { UINT8 *packA = matrix1; - kernel_func kernel[3][5] = {{mmm_avx512_n_mtail, mmm_avx512_1x8_asm, mmm_avx512_1x16_asm, - mmm_avx512_1x32_asm, mmm_avx512_1x48_asm}, - {mmm_avx512_n_mtail, mmm_avx512_12x8_asm, mmm_avx512_12x16_asm, mmm_avx512_6x32_asm, - mmm_avx512_4x48_asm}, - {mmm_avx512_n_mtail, mmm_avx512_24x8_asm, mmm_avx512_24x16_asm, mmm_avx512_12x32_asm, - mmm_avx512_8x48_asm}}; - U32 unrollNSizes[5] = {8, 8, 16, 32, 48}; - U32 unrollMSize[5] = {M, 24, 24, 12, 8}; + kernel_func kernel[24][4] = { + {mmm_avx512_1x8_asm, mmm_avx512_1x16_asm, mmm_avx512_1x32_asm, mmm_avx512_1x48_asm}, + {mmm_avx512_2x8_asm, mmm_avx512_2x16_asm, mmm_avx512_2x32_asm, mmm_avx512_2x48_asm}, + {mmm_avx512_3x8_asm, mmm_avx512_3x16_asm, mmm_avx512_3x32_asm, mmm_avx512_3x48_asm}, + {mmm_avx512_4x8_asm, mmm_avx512_4x16_asm, mmm_avx512_4x32_asm, mmm_avx512_4x48_asm}, + {mmm_avx512_5x8_asm, mmm_avx512_5x16_asm, mmm_avx512_5x32_asm, mmm_avx512_5x48_asm}, + {mmm_avx512_6x8_asm, mmm_avx512_6x16_asm, mmm_avx512_6x32_asm, mmm_avx512_6x48_asm}, + {mmm_avx512_7x8_asm, mmm_avx512_7x16_asm, mmm_avx512_7x32_asm, mmm_avx512_7x48_asm}, + {mmm_avx512_8x8_asm, mmm_avx512_8x16_asm, mmm_avx512_8x32_asm, mmm_avx512_8x48_asm}, + {mmm_avx512_9x8_asm, mmm_avx512_9x16_asm, mmm_avx512_9x32_asm, mmm_avx512_8x48_asm}, + {mmm_avx512_10x8_asm, mmm_avx512_10x16_asm, mmm_avx512_10x32_asm, mmm_avx512_8x48_asm}, + {mmm_avx512_11x8_asm, mmm_avx512_11x16_asm, mmm_avx512_11x32_asm, mmm_avx512_8x48_asm}, + {mmm_avx512_12x8_asm, mmm_avx512_12x16_asm, mmm_avx512_12x32_asm, mmm_avx512_8x48_asm}, + {mmm_avx512_13x8_asm, mmm_avx512_13x16_asm, mmm_avx512_12x32_asm, mmm_avx512_8x48_asm}, + {mmm_avx512_14x8_asm, mmm_avx512_14x16_asm, mmm_avx512_12x32_asm, mmm_avx512_8x48_asm}, + {mmm_avx512_15x8_asm, mmm_avx512_15x16_asm, mmm_avx512_12x32_asm, mmm_avx512_8x48_asm}, + {mmm_avx512_16x8_asm, mmm_avx512_16x16_asm, mmm_avx512_12x32_asm, mmm_avx512_8x48_asm}, + {mmm_avx512_17x8_asm, mmm_avx512_17x16_asm, mmm_avx512_12x32_asm, mmm_avx512_8x48_asm}, + {mmm_avx512_18x8_asm, mmm_avx512_18x16_asm, mmm_avx512_12x32_asm, mmm_avx512_8x48_asm}, + {mmm_avx512_19x8_asm, mmm_avx512_19x16_asm, mmm_avx512_12x32_asm, mmm_avx512_8x48_asm}, + {mmm_avx512_20x8_asm, mmm_avx512_20x16_asm, mmm_avx512_12x32_asm, mmm_avx512_8x48_asm}, + {mmm_avx512_21x8_asm, mmm_avx512_21x16_asm, mmm_avx512_12x32_asm, mmm_avx512_8x48_asm}, + {mmm_avx512_22x8_asm, mmm_avx512_22x16_asm, mmm_avx512_12x32_asm, mmm_avx512_8x48_asm}, + {mmm_avx512_23x8_asm, mmm_avx512_23x16_asm, mmm_avx512_12x32_asm, mmm_avx512_8x48_asm}, + {mmm_avx512_24x8_asm, mmm_avx512_24x16_asm, mmm_avx512_12x32_asm, mmm_avx512_8x48_asm}}; + U32 unrollNSizes[4] = {8, 16, 32, 48}; + U32 unrollMSizes[5] = {24, 24, 12, 8}; U32 alignedK = (K + 7) / 8 * 8; I32 *offsetC = (I32 *)(tmp); tmp += N * bytesOf(DT_I32); - UINT8 *tmpA = tmp; - tmp += M * alignedK * bytesOf(DT_U8_Q); - packB += N * bytesOf(DT_I32); - if (uintptr_t(tmp + N * bytesOf(DT_I32)) == uintptr_t(packB)) { // matmul - tmp += N * alignedK * bytesOf(DT_I8) + N * bytesOf(DT_I32); - } - U32 flags = 0; F32 *factorPtr = nullptr; F32 factor = 0; I32 *i32Result = (I32 *)result; UINT8 *u8Result = result; if (scale != nullptr) { - if (scale[0] < - 0) { // when use offline scale, the output datatype is U8_Q, you need more tmp buffer + if (scale[0] < 0) { + // when use offline scale, the 
output datatype is U8_Q, you need more tmp buffer flags |= 1 << 1; factor = scale[1]; i32Result = (I32 *)tmp; - memset(i32Result, 0, M * N * bytesOf(DT_I32)); + UNI_MEMSET(i32Result, 0, M * N * bytesOf(DT_I32)); tmp += M * N * bytesOf(DT_I32); } else { factor = 1 / (scale[0]); @@ -4875,194 +2163,124 @@ EE mmm_avx512_vnni_int8(U32 N, factorPtr = &factor; } - auto computeMNums = [=](U32 block, U32 unit) { - return block / unit + (block % unit >= (unit / 2)) + (block % (unit / 2)); + auto getEdgeMSize = [](U32 resM, U32 unrollM) { + U32 unit = unrollM / 2; + U32 low = unrollM / 4; + return (resM > 1) ? ((resM > low) ? ((resM + unit - 1) / unit * unit) : low) : resM; }; - - U32 mNum = M / BOLCK_M_DIM; - U32 unNum = N / UNROLL_N; - U32 unArrays[4] = {0}; - U32 umArrays[4] = {0}; - U32 umNums[4] = {0}; - U32 umResNums[4] = {0}; - U32 res = N % UNROLL_N; - unArrays[0] = UNROLL_N; - umArrays[0] = unrollMSize[(UNROLL_N >> 4) + 1]; - umNums[0] = computeMNums(BOLCK_M_DIM, umArrays[0]); - U32 idx = 1; - while (res > 0) { - unArrays[idx] = UNI_MIN(unrollNSizes[(res >> 4) + 1], res); - umArrays[idx] = unrollMSize[(res >> 4) + 1]; - umNums[idx] = computeMNums(BOLCK_M_DIM, umArrays[idx]); - if (unArrays[idx] < 8) { - umArrays[idx] = UNI_MIN(unrollMSize[0], BOLCK_M_DIM); - umNums[idx] = 1; - } - res -= unArrays[idx++]; + auto getMNum = [](U32 mDim, U32 unrollM) { return mDim / unrollM + ((mDim % unrollM) > 0); }; + + U32 resN = N % UNROLL_N; + U32 edgeNSize = (resN > 8) ? UNI_ALIGN(resN, 16) : 8; + U32 resM = M % UNROLL_M; + U32 mainEdgeMSize = getEdgeMSize(resM, UNROLL_M); + UINT8 *lastMainBlockA = packA + M / UNROLL_M * UNROLL_M * K; + if (resM < mainEdgeMSize && matrix1Df == DF_NORMAL) { // padding last block + UNI_MEMCPY(tmp, lastMainBlockA, resM * K); + UNI_MEMSET(tmp + resM * K, 128, (mainEdgeMSize - resM) * K); + lastMainBlockA = tmp; + tmp += mainEdgeMSize * K; } - U32 nLoopNum = unNum * umNums[0] + umNums[1] + umNums[2] + umNums[3]; - U32 mLoopNum = nLoopNum * mNum; - U32 nLoopResNum = 0; - if (M % BOLCK_M_DIM > 0) { - res = M % BOLCK_M_DIM; - for (U32 i = 0; i < 4 && umArrays[i] > 0; ++i) { - if (unArrays[i] < 8) { - umResNums[i] = 1; - } else { - umResNums[i] = computeMNums(res, umArrays[i]); - } - } - nLoopResNum = (unNum * umResNums[0] + umResNums[1] + umResNums[2] + umResNums[3]); + U32 mloopNum = getMNum(BOLCK_M_DIM, UNROLL_M) * (M / BOLCK_M_DIM) + + getMNum(M % BOLCK_M_DIM, UNROLL_M) * (M % BOLCK_M_DIM > 0); + + U32 newUnrollM = unrollMSizes[edgeNSize >> 4]; + resM = M % newUnrollM; + U32 resEdgeMSize = getEdgeMSize(resM, newUnrollM); + UINT8 *lastResBlockA = packA + M / newUnrollM * newUnrollM * K; + if (resM < resEdgeMSize && matrix1Df == DF_NORMAL) { // padding last block + UNI_MEMCPY(tmp, lastResBlockA, resM * K); + UNI_MEMSET(tmp + resM * K, 128, (resEdgeMSize - resM) * K); + lastResBlockA = tmp; + tmp += resEdgeMSize * K; } - idx = (unNum > 0) ? 0 : 1; - U32 umUnit = umArrays[idx]; - U32 firstLoopNum = (unArrays[idx] >= 8) ? 
computeMNums(M, umUnit) : 1; - U32 loopNum = mLoopNum + nLoopResNum - firstLoopNum; - if (unNum >= 1) { - unNum -= 1; - nLoopNum -= umNums[0]; - nLoopResNum -= umResNums[0]; - } else { - nLoopNum -= umNums[1]; - nLoopResNum -= umResNums[1]; + U32 resMloopNum = getMNum(BOLCK_M_DIM, newUnrollM) * (M / BOLCK_M_DIM) + + getMNum(M % BOLCK_M_DIM, newUnrollM) * (M % BOLCK_M_DIM > 0); + + U32 padM = UNI_MAX(UNI_ALIGN(M, UNROLL_M), UNI_ALIGN(M, newUnrollM)); + UINT8 *tmpK = tmp; + U32 resK = K % SIMDW; + if (resK > 0 && matrix1Df == DF_NORMAL) { + for (U32 i = 0; i < M; ++i) { + UNI_MEMCPY(tmpK + i * SIMDW, packA + (i + 1) * K - resK, resK); + UNI_MEMSET(tmpK + i * SIMDW + resK, 128, SIMDW - resK); + } + UNI_MEMSET(tmpK + M * SIMDW, 128, (padM - M) * SIMDW); + tmp += padM * SIMDW; } - mLoopNum = nLoopNum * mNum; + U32 mNNum = N / UNROLL_N; + U32 alginedN = mNNum * UNROLL_N + (resN > 0) * edgeNSize; + U32 nmask = pow(2, N % 16) - 1; + U32 loopNum = mNNum * mloopNum + (resN > 0) * resMloopNum; + U32 bmLoopNum = + mNNum * getMNum(BOLCK_M_DIM, UNROLL_M) + (resN > 0) * getMNum(BOLCK_M_DIM, newUnrollM); #ifdef _USE_OPENMP -#pragma omp parallel num_threads(OMP_NUM_THREADS) if (mLoopNum + nLoopResNum > OMP_NUM_THREADS) - { +#pragma omp parallel num_threads(OMP_NUM_THREADS) #endif + { U32 blockSizeK = 0; for (U32 k = 0; k < K; k += blockSizeK) { blockSizeK = UNI_MIN(BOLCK_K_DIM, K - k); - blockSizeK = UNI_MAX(blockSizeK % SIMDW, blockSizeK - blockSizeK % SIMDW); - U32 alignedBlockSizeK = align_size(blockSizeK, SIMDW); F32 *useFactor = nullptr; flags |= (k > 0); if (k == K - blockSizeK) { useFactor = factorPtr; } -#ifdef _USE_OPENMP -#pragma omp for schedule(static) -#endif - for (U32 l = 0; l < firstLoopNum; ++l) { - U32 umNum = M / umUnit; - U32 idxM = 2; - U32 m = 0; - U32 unrollSizeM = 0; - if (l < umNum) { - m = l * umUnit; - unrollSizeM = umUnit; - } else if (l == umNum) { - m = umNum * umUnit; - if ((M - umNum * umUnit) >= (umUnit / 2)) { - unrollSizeM = umUnit / 2; - idxM = 1; - } else { - unrollSizeM = 1; - idxM = 0; - } - } else { - if (M >= (umNum * umUnit + umUnit / 2)) { - m = umNum * umUnit + umUnit / 2 + (l - umNum - 1); - } else { - m = umNum * umUnit + (l - umNum); - } - unrollSizeM = 1; - idxM = 0; - } - - U32 stepK = K; - INT8 *curB = packB + k * N; - UINT8 *curA = packA + m * stepK + k; - if (matrix1Df == DF_TRANSPOSE) { - curA = tmpA + m * alignedBlockSizeK; - matrix2_trans_r(unrollSizeM, blockSizeK, M, SIMDW, matrix1 + m + k * M, curA); - stepK = alignedBlockSizeK; - } else if (matrix1Df == DF_NORMAL && blockSizeK < SIMDW) { - curA = tmpA + m * alignedBlockSizeK; - matrix1_trans_r(unrollSizeM, blockSizeK, K, SIMDW, matrix1 + k + m * K, curA); - stepK = alignedBlockSizeK; - } - kernel[idxM][(unArrays[idx] >> 4) + (unArrays[idx] >= 8)](unrollSizeM, - unArrays[idx], alignedBlockSizeK / 8, curA, curB, i32Result + m * N, - u8Result + m * N, offsetC, N, stepK, useFactor, flags); + U32 realK = blockSizeK; + U32 stepK = K; + if (matrix1Df == DF_TRANSPOSE) { + matrix2_trans_r(M, blockSizeK, M, SIMDW, packA, tmp); + realK = UNI_ALIGN(realK, SIMDW); + packA = tmp; + stepK = realK; } #ifdef _USE_OPENMP #pragma omp for schedule(static) #endif for (U32 l = 0; l < loopNum; ++l) { - U32 bm = l / nLoopNum * BOLCK_M_DIM; - U32 nLoop = l % nLoopNum; - U32 unrollSizeN = 0; - U32 blockSizeM = 0; - U32 unrollM = 0; - U32 m = 0, n = 0; - U32 *umNumsPtr; - if (l < mLoopNum) { - blockSizeM = BOLCK_M_DIM; - umNumsPtr = umNums; - } else { - blockSizeM = M % BOLCK_M_DIM; - umNumsPtr = umResNums; + U32 bm = l / 
bmLoopNum * BOLCK_M_DIM; + U32 blockSizeM = UNI_MIN(BOLCK_M_DIM, M - bm); + U32 mMNum = getMNum(blockSizeM, UNROLL_M); + U32 bn = l % bmLoopNum; + U32 nLoop = bn / mMNum; + U32 n = nLoop * UNROLL_N; + U32 mLoop = bn % mMNum; + U32 m = mLoop * UNROLL_M; + U32 edgeMSize = mainEdgeMSize; + U32 unrollM = UNROLL_M; + U32 mNum = mMNum; + U32 nSize = UNROLL_N; + UINT8 *lastBlockA = lastMainBlockA; + if (bn >= mNNum * mMNum) { + nLoop = mNNum; + n = mNNum * UNROLL_N; + mLoop = bn - mNNum * mMNum; + m = mLoop * newUnrollM; + edgeMSize = resEdgeMSize; + lastBlockA = lastResBlockA; + unrollM = newUnrollM; + mNum = getMNum(blockSizeM, newUnrollM); + nSize = edgeNSize; } - if (nLoop < unNum * umNumsPtr[0]) { - n = nLoop / umNumsPtr[0] * unArrays[0]; - m = nLoop % umNumsPtr[0]; - unrollSizeN = unArrays[0]; - unrollM = umArrays[0]; - } else { - n = unNum * unArrays[0]; - U32 x = unNum * umNumsPtr[0]; - for (int j = idx + 1; j < 4; x += umNumsPtr[j], n += unArrays[j], ++j) { - if (nLoop < x + umNumsPtr[j]) { - m = nLoop - x; - unrollSizeN = unArrays[j]; - unrollM = umArrays[j]; - break; - } - } + U32 um = (unrollM + m > blockSizeM) ? edgeMSize : unrollM; + U32 rm = UNI_MIN(unrollM, blockSizeM - m); + INT8 *curB = packB + k * alginedN + n * UNI_ALIGN(realK, SIMDW); + UINT8 *curA = packA + (m + bm) * stepK + k; + if ((mLoop == (mNum - 1)) && (M - bm <= BOLCK_M_DIM) && (resM < edgeMSize) && + (matrix1Df == DF_NORMAL)) { + curA = lastBlockA + k; } - - U32 unrollSizeM = 0; - U32 umNum = blockSizeM / unrollM; - U32 idxM = 2; - if (m < umNum) { - m = m * unrollM; - unrollSizeM = unrollM; - } else if (m == umNum) { - m = umNum * unrollM; - if ((blockSizeM - umNum * unrollM) >= (unrollM / 2)) { - unrollSizeM = unrollM / 2; - idxM = 1; - } else { - unrollSizeM = 1; - idxM = 0; - } - } else { - if (blockSizeM >= (umNum * unrollM + unrollM / 2)) { - m = umNum * unrollM + unrollM / 2 + (m - umNum - 1); - } else { - m = umNum * unrollM + (m - umNum); - } - unrollSizeM = 1; - idxM = 0; - } - - n += unArrays[idx]; - INT8 *curB = packB + k * N + n * alignedBlockSizeK; - UINT8 *curA = packA + (m + bm) * K + k; - kernel[idxM][(unrollSizeN >> 4) + (unrollSizeN >= 8)](unrollSizeM, unrollSizeN, - alignedBlockSizeK / 8, curA, curB, i32Result + (m + bm) * N + n, - u8Result + (m + bm) * N + n, offsetC + n, N, K, useFactor, flags); + UINT8 *kpad = tmpK + (m + bm) * SIMDW; + U32 tnmask = (nLoop == mNNum - 1 + (resN > 0)) ? 
nmask : 0; + kernel[rm - 1][nSize >> 4](um, nSize, realK, curA, curB, + i32Result + (m + bm) * N + n, u8Result + (m + bm) * N + n, offsetC + n, N, + stepK, useFactor, tnmask, kpad, flags); } } -#ifdef _USE_OPENMP } -#endif - return SUCCESS; } diff --git a/compute/blas_enhance/src/cpu/x86/int8/mvm_avx512_vnni.cpp b/compute/blas_enhance/src/cpu/x86/int8/mvm_avx512_vnni.cpp index 4ec2c1e3..b3fa74be 100644 --- a/compute/blas_enhance/src/cpu/x86/int8/mvm_avx512_vnni.cpp +++ b/compute/blas_enhance/src/cpu/x86/int8/mvm_avx512_vnni.cpp @@ -41,7 +41,7 @@ EE matrix_vector_multiply_transform_weight_int8( I32 *sumB = nullptr; if (!hasBias) { sumB = (I32 *)packB; - memset(sumB, 0, N * sizeof(I32)); + UNI_MEMSET(sumB, 0, N * sizeof(I32)); packB += N * bytesOf(DT_I32); } else { sumB = offsetCBias; @@ -49,13 +49,13 @@ EE matrix_vector_multiply_transform_weight_int8( U32 blockKSize = 0; for (U32 bk = 0; bk < K; bk += blockKSize) { blockKSize = UNI_MIN(K - bk, BOLCK_K_DIM); - U32 alignedBlockSizeK = align_size(blockKSize, 4); + U32 alignedBlockSizeK = UNI_ALIGN(blockKSize, 4); for (U32 un = 0; un < N; un += unrollSizeN) { unrollSizeN = UNI_MIN(UNROLL_N, N - un); unrollSizeN = unrollSize[unrollSizeN >> 4]; if (N - un < unrollSizeN) { unrollSizeN = N - un; - memset(packB, 0, unrollSizeN * alignedBlockSizeK); + UNI_MEMSET(packB, 0, unrollSizeN * alignedBlockSizeK); for (U32 k = 0; k < alignedBlockSizeK; k += 4) { for (U32 i = 0; i < unrollSizeN; ++i) { for (U32 ii = 0; ii < 4 && k + ii < blockKSize; ++ii) { @@ -65,7 +65,8 @@ EE matrix_vector_multiply_transform_weight_int8( } } } else { - matrix1_trans_l(unrollSizeN, blockKSize, K, 4, src + un * K + bk, packB); + matrix1_trans_l( + unrollSizeN, unrollSizeN, blockKSize, K, 4, src + un * K + bk, packB); } packB += unrollSizeN * alignedBlockSizeK; } @@ -84,7 +85,7 @@ EE matrix_vector_multiply_transform_weight_int8( I32 *sumB = nullptr; if (!hasBias) { sumB = (I32 *)packB; - memset(sumB, 0, N * sizeof(I32)); + UNI_MEMSET(sumB, 0, N * sizeof(I32)); packB += N * bytesOf(DT_I32); } else { sumB = offsetCBias; @@ -92,13 +93,13 @@ EE matrix_vector_multiply_transform_weight_int8( U32 blockKSize = 0; for (U32 bk = 0; bk < K; bk += blockKSize) { blockKSize = UNI_MIN(K - bk, BOLCK_K_DIM); - U32 alignedBlockSizeK = align_size(blockKSize, 4); + U32 alignedBlockSizeK = UNI_ALIGN(blockKSize, 4); for (U32 un = 0; un < N; un += unrollSizeN) { unrollSizeN = UNI_MIN(UNROLL_N, N - un); unrollSizeN = unrollSize[unrollSizeN >> 4]; if (N - un < unrollSizeN) { unrollSizeN = N - un; - memset(packB, 0, unrollSizeN * alignedBlockSizeK); + UNI_MEMSET(packB, 0, unrollSizeN * alignedBlockSizeK); for (U32 k = 0; k < blockKSize; k += 4) { for (U32 i = 0; i < unrollSizeN; ++i) { for (U32 ii = 0; ii < 4 && k + ii < blockKSize; ++ii) { @@ -108,7 +109,8 @@ EE matrix_vector_multiply_transform_weight_int8( } } } else { - matrix2_trans_l(unrollSizeN, blockKSize, N, 4, src + un + bk * N, packB); + matrix2_trans_l( + unrollSizeN, unrollSizeN, blockKSize, N, 4, src + un + bk * N, packB); } packB += unrollSizeN * alignedBlockSizeK; } @@ -680,6 +682,8 @@ void mvm_row_avx512_tail(U32 bn, I32 tmp = 0; if ((flags & 0x1) == 0) { tmp += offsetC[n]; + } else { + tmp = ((I32 *)result)[n]; } for (U32 k = 0; k < bk; k += 4) { for (U32 k4 = 0; k4 < 4; ++k4) { @@ -717,12 +721,12 @@ EE mvm_avx512_int8(U32 numRows, I32 *i32Result = (I32 *)result; UINT8 *u8Result = result; if (scale != nullptr) { - if (scale[0] < - 0) { // when use offline scale, the output datatype is U8_Q, you need more tmp buffer + // when use offline 
scale, the output datatype is U8_Q, you need more tmp buffer + if (scale[0] < 0) { flags |= 1 << 1; factor = scale[1]; i32Result = offsetCBias + numRows; - memset(i32Result, 0, numRows * bytesOf(DT_I32)); + UNI_MEMSET(i32Result, 0, numRows * bytesOf(DT_I32)); } else { factor = 1 / (*scale); } @@ -731,7 +735,7 @@ EE mvm_avx512_int8(U32 numRows, packB += numRows * bytesOf(DT_I32); for (U32 k = 0; k < numColumns; k += blockSizeK) { blockSizeK = UNI_MIN(BOLCK_K_DIM, numColumns - k); - U32 alignedBlockSizeK = align_size(blockSizeK, 4); + U32 alignedBlockSizeK = UNI_ALIGN(blockSizeK, 4); flags |= (k > 0); F32 *useFactor = nullptr; if (k == numColumns - blockSizeK) { diff --git a/compute/blas_enhance/src/cpu/x86/int8/mvm_avx512_vnni_row.cpp b/compute/blas_enhance/src/cpu/x86/int8/mvm_avx512_vnni_row.cpp index 5ad5b8bd..5030eea6 100644 --- a/compute/blas_enhance/src/cpu/x86/int8/mvm_avx512_vnni_row.cpp +++ b/compute/blas_enhance/src/cpu/x86/int8/mvm_avx512_vnni_row.cpp @@ -1000,7 +1000,7 @@ EE mvm_avx512_int8_row_i8u8(U32 numRows, flags |= 1 << 1; factor = scale[1]; i32Result = (I32 *)((UINT8 *)tmp + numRows * numColumns); - memset(i32Result, 0, numRows * bytesOf(DT_I32)); + UNI_MEMSET(i32Result, 0, numRows * bytesOf(DT_I32)); } else { factor = 1 / (*scale); } diff --git a/compute/blas_enhance/src/cpu/x86/mmm.cpp b/compute/blas_enhance/src/cpu/x86/mmm.cpp index d75353e8..62fe0455 100644 --- a/compute/blas_enhance/src/cpu/x86/mmm.cpp +++ b/compute/blas_enhance/src/cpu/x86/mmm.cpp @@ -23,7 +23,7 @@ #endif EE matrix_matrix_multiply_tmp_bytes_x86( - U32 matrixA_M, U32 matrixA_K, U32 matrixB_K, U32 matrixB_N, DataType dt, U32 *bytes) + U32 matrixA_M, U32 matrixA_K, U32 matrixB_K, U32 matrixB_N, DataFormat df, DataType dt, U32 *bytes) { EE ret = SUCCESS; switch (dt) { @@ -38,7 +38,7 @@ EE matrix_matrix_multiply_tmp_bytes_x86( case DT_U8_Q: case DT_I8: { matrix_matrix_multiply_tmp_bytes_int8( - matrixA_M, matrixA_K, matrixB_K, matrixB_N, dt, bytes); + matrixA_M, matrixA_K, matrixB_K, matrixB_N, df, dt, bytes); break; } #endif @@ -51,7 +51,7 @@ EE matrix_matrix_multiply_tmp_bytes_x86( } static EE matrix_matrix_multiply_transform_rhsN( - TensorDesc desc, const void *src, TensorDesc *descTran, void *dst, void *offsetCBias) + TensorDesc desc, const void *src, TensorDesc *descTran, void *dst) { EE ret = SUCCESS; switch (desc.dt) { @@ -64,7 +64,7 @@ static EE matrix_matrix_multiply_transform_rhsN( #ifdef _USE_INT8 case DT_I8: { ret = matrix_matrix_multiply_transform_rhsN_int8( - desc, (INT8 *)src, (INT8 *)dst, (I32 *)offsetCBias); + desc, (INT8 *)src, (INT8 *)dst); break; } #endif @@ -78,7 +78,7 @@ static EE matrix_matrix_multiply_transform_rhsN( } static EE matrix_matrix_multiply_transform_rhsT( - TensorDesc desc, const void *src, TensorDesc *descTran, void *dst, void *offsetCBias) + TensorDesc desc, const void *src, TensorDesc *descTran, void *dst) { EE ret = SUCCESS; switch (desc.dt) { @@ -91,7 +91,7 @@ static EE matrix_matrix_multiply_transform_rhsT( #ifdef _USE_INT8 case DT_I8: { ret = matrix_matrix_multiply_transform_rhsT_int8( - desc, (INT8 *)src, (INT8 *)dst, (I32 *)offsetCBias); + desc, (INT8 *)src, (INT8 *)dst); break; } #endif @@ -106,7 +106,7 @@ static EE matrix_matrix_multiply_transform_rhsT( } EE matrix_matrix_multiply_transform_rhs_x86( - TensorDesc desc, const void *src, TensorDesc *descTran, void *dst, void *offsetCBias) + TensorDesc desc, const void *src, TensorDesc *descTran, void *dst) { if (desc.df == targetFormat4MatrixB(desc.dt)) { return SUCCESS; @@ -114,11 +114,11 @@ EE 
matrix_matrix_multiply_transform_rhs_x86( EE ret = SUCCESS; switch (desc.df) { case DF_NORMAL: { - ret = matrix_matrix_multiply_transform_rhsN(desc, src, descTran, dst, offsetCBias); + ret = matrix_matrix_multiply_transform_rhsN(desc, src, descTran, dst); break; } case DF_TRANSPOSE: { - ret = matrix_matrix_multiply_transform_rhsT(desc, src, descTran, dst, offsetCBias); + ret = matrix_matrix_multiply_transform_rhsT(desc, src, descTran, dst); break; } default: diff --git a/compute/blas_enhance/src/mmm.cpp b/compute/blas_enhance/src/mmm.cpp index 5d982700..871121cb 100644 --- a/compute/blas_enhance/src/mmm.cpp +++ b/compute/blas_enhance/src/mmm.cpp @@ -12,6 +12,7 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #include "blas_enhance.h" +#include "uni.h" #ifdef _USE_GENERAL #include "cpu/general/blas_general.h" #endif @@ -44,10 +45,10 @@ EE matrix_matrix_multiply_tmp_bytes( #ifdef _USE_X86 } else if (IS_X86(arch)) { ret = matrix_matrix_multiply_tmp_bytes_x86( - matrixA_M, matrixA_K, matrixB_K, matrixB_N, matrixADataType, bytes); + matrixA_M, matrixA_K, matrixB_K, matrixB_N, matrixADataFormat, matrixADataType, bytes); #endif #ifdef _USE_NEON - } else { + } else if (IS_ARM(arch)) { ret = matrix_matrix_multiply_tmp_bytes_arm( matrixA_M, matrixA_K, matrixB_K, matrixB_N, matrixADataType, bytes); #endif @@ -59,23 +60,21 @@ EE matrix_matrix_multiply_transform_rhs( TensorDesc desc, const void *src, TensorDesc *descTran, void *dst, Arch arch) { EE ret = NOT_SUPPORTED; -#ifdef _USE_NEON if (IS_ARM(arch)) { +#ifdef _USE_NEON ret = matrix_matrix_multiply_transform_rhs_arm(desc, src, descTran, dst); - } #endif #ifdef _USE_GENERAL - if (IS_GENERAL(arch)) { - memcpy(dst, src, tensorNumBytes(desc)); + } else if (IS_GENERAL(arch)) { + UNI_MEMCPY(dst, src, tensorNumBytes(desc)); (*descTran) = desc; ret = SUCCESS; - } #endif #ifdef _USE_X86 - if (IS_X86(arch)) { - ret = matrix_matrix_multiply_transform_rhs_x86(desc, src, descTran, dst, nullptr); - } + } else if (IS_X86(arch)) { + ret = matrix_matrix_multiply_transform_rhs_x86(desc, src, descTran, dst); #endif + } return ret; } @@ -142,23 +141,23 @@ EE matrix_matrix_multiply(TensorDesc matrixADesc, TensorDesc tranDescB; U8 *dataB = (U8 *)matrixBData; if (matrixBDataFormat != targetFormat4MatrixB(matrixBDataType)) { - U8 *offsetCBias = nullptr; - U32 alignedAK = matrixA_K; dataB = ((U8 *)tmp); if (matrixADataType == DT_U8_Q && matrixBDataType == DT_I8) { - offsetCBias = (U8 *)tmp; - alignedAK = (matrixA_K + 7) / 8 * 8; - dataB += matrixC_N * bytesOf(DT_I32); + U32 alignedK = (matrixB_K + 7) / 8 * 8; + U32 alignedN = (matrixB_N + 15) / 16 * 16; + tmp = (U8 *)tmp + alignedK * alignedN; + } else { + U32 alignedN = (matrixB_N + 7) / 8 * 8; + tmp = (U8 *)tmp + matrixB_K * alignedN; } - dataB += matrixA_M * alignedAK * bytesOf(matrixADataType); ret = matrix_matrix_multiply_transform_rhs_x86( - matrixBDesc, matrixBData, &tranDescB, dataB, offsetCBias); + matrixBDesc, matrixBData, &tranDescB, dataB); } ret = mmm_x86(matrixC_N, matrixC_M, matrixA_K, matrixBDataType, matrixADataFormat, matrixAData, dataB, tmp, matrixCData, scale); #endif #ifdef _USE_NEON - } else { + } else if (IS_ARM(arch)) { TensorDesc tranDescB; U8 *dataB = (U8 *)matrixBData; if (matrixBDataFormat != targetFormat4MatrixB(matrixBDataType)) { diff --git a/compute/blas_enhance/src/mvm.cpp b/compute/blas_enhance/src/mvm.cpp index 359a7d58..501423d2 100644 --- a/compute/blas_enhance/src/mvm.cpp +++ b/compute/blas_enhance/src/mvm.cpp @@ -48,23 +48,21 @@ EE 
matrix_vector_multiply_transform_weight( TensorDesc desc, const void *src, TensorDesc *descTran, void *dst, Arch arch) { EE ret = NOT_SUPPORTED; -#ifdef _USE_NEON if (IS_ARM(arch)) { +#ifdef _USE_NEON ret = matrix_vector_multiply_transform_weight_arm(desc, src, descTran, dst); - } #endif #ifdef _USE_GENERAL - if (IS_GENERAL(arch)) { - memcpy(dst, src, tensorNumBytes(desc)); + } else if (IS_GENERAL(arch)) { + UNI_MEMCPY(dst, src, tensorNumBytes(desc)); (*descTran) = desc; ret = SUCCESS; - } #endif #ifdef _USE_X86 - if (IS_X86(arch)) { + } else if (IS_X86(arch)) { ret = matrix_vector_multiply_transform_weight_x86(desc, src, descTran, dst, nullptr); - } #endif + } return ret; } @@ -140,7 +138,7 @@ EE matrix_vector_multiply(TensorDesc matrixDesc, result, tmp, scale); #endif #ifdef _USE_NEON - } else { + } else if (IS_ARM(arch)) { ret = mvm_arm(matrixRow, matrixColumn, matrixDataType, matrixDataFormat, matrix, vector, tmp, result, arch); #endif diff --git a/compute/blas_enhance/tests/test_mmm.cpp b/compute/blas_enhance/tests/test_mmm.cpp index 80f3923e..b5de8983 100644 --- a/compute/blas_enhance/tests/test_mmm.cpp +++ b/compute/blas_enhance/tests/test_mmm.cpp @@ -31,7 +31,9 @@ int mmmTestKernel(U32 m, U32 k, U32 n, DataType dt) U32 bytes = 0; U8 *A = ut_input_v(m * k, dt, UT_INIT_RANDOM); U8 *B = ut_input_v(k * n, dt, UT_INIT_RANDOM); - U8 *B_tran = ut_input_v(k * n + 32, dt, UT_INIT_ZERO); + + U32 alignedN = (n + 7) / 8 * 8; + U8 *B_tran = ut_input_v(k * alignedN + 32, dt, UT_INIT_ZERO); U8 *C = ut_input_v(m * n, dt, UT_INIT_ZERO); U8 *C_ref = ut_input_v(m * n, dt, UT_INIT_ZERO); CHECK_STATUS(matrix_matrix_multiply_tmp_bytes(A_desc, B_desc, &bytes, UT_ARCH)); diff --git a/compute/blas_enhance/tests/test_mmm_int8.cpp b/compute/blas_enhance/tests/test_mmm_int8.cpp index 18a88be6..c7c72777 100644 --- a/compute/blas_enhance/tests/test_mmm_int8.cpp +++ b/compute/blas_enhance/tests/test_mmm_int8.cpp @@ -11,17 +11,16 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
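Reviewer note on the int8 GEMM changes above: when an offline scale is supplied (scale[0] < 0, flag bit 1 set), the AVX512-VNNI kernels requantize the I32 accumulators to U8_Q by scaling, adding a +128 offset and saturating to a byte (the vcvtdq2ps / vmulps / vcvtps2dq / vpmovusdb sequence, mirrored in C by the removed mmm_avx512_n_mtail tail). A scalar sketch of that step, with illustrative names only:

static inline unsigned char requantize_to_u8(int acc, float scale)
{
    float f = acc * scale;      // vcvtdq2ps + vmulps: scale the I32 accumulator
    int q = (int)(f + 128);     // shift from the symmetric INT8 range into the unsigned range
    if (q < 0) {
        q = 0;                  // vpmovusdb saturates the low end at 0
    }
    return (q > 255) ? 255 : (unsigned char)q;  // ...and the high end at 255
}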
+#include #include "blas_enhance.h" #include "ut_util.h" +#include "thread_affinity.h" -int main(int argc, char **argv) +//#define COVER_TEST + +int testMMM(U32 m, U32 k, U32 n) { #ifdef _USE_INT8 - CHECK_REQUIREMENT(argc == 4); - U32 m = atoi(argv[1]); - U32 k = atoi(argv[2]); - U32 n = atoi(argv[3]); - DataType dt = DT_I8; DataType odt = DT_I32; TensorDesc A_desc = tensor2df(dt, DF_NORMAL, m, k); @@ -31,17 +30,22 @@ int main(int argc, char **argv) U32 bytes = 0; U32 k8 = k; + U32 n8 = n; if (k8 % 8 != 0) { k8 = (k8 / 8) * 8 + 8; } + if (n8 % 16 != 0) { + n8 = (n8 / 16) * 16 + 16; + } INT8 *A = (INT8 *)ut_input_v(m * k, DT_I8, UT_INIT_RANDOM); INT8 *A_ref = (INT8 *)ut_input_v(m * k, DT_I8, UT_INIT_RANDOM); - memcpy(A_ref, A, m * k); + UNI_MEMCPY(A_ref, A, m * k); INT8 *B = (INT8 *)ut_input_v(k * n, DT_I8, UT_INIT_RANDOM); - INT8 *B_tran = (INT8 *)ut_input_v(k8 * n + 64 + n * 4, DT_I8, UT_INIT_ZERO); + INT8 *B_tran = (INT8 *)ut_input_v(k8 * n8 + 64 + n8 * 4, DT_I8, UT_INIT_ZERO); I32 *C = (I32 *)ut_input_v(m * n, DT_I32, UT_INIT_ZERO); I32 *C_ref = (I32 *)ut_input_v(m * n, DT_I32, UT_INIT_ZERO); CHECK_STATUS(matrix_matrix_multiply_tmp_bytes(A_desc, B_desc, &bytes, UT_ARCH)); + bytes += m * n; INT8 *tmp = (INT8 *)ut_input_v(bytes, DT_I8, UT_INIT_ZERO); matrix_matrix_multiply_transform_rhs(B_desc, B, &tranDescB, B_tran, UT_ARCH); @@ -51,15 +55,15 @@ int main(int argc, char **argv) for (U32 i = 0; i < m * k; ++i) { uA[i] = (UINT8)((I32)A[i] + 128); } - memcpy(tmp, B_tran, n * bytesOf(DT_I32)); + UNI_MEMCPY(tmp, B_tran + n8 * k8, n * bytesOf(DT_I32)); #endif if (UT_CHECK) { - CHECK_STATUS(matrix_matrix_multiply( - A_desc, A, tranDescB, B_tran, bytes, tmp, C_desc, C, nullptr, UT_ARCH)); + CHECK_STATUS( + matrix_matrix_multiply(A_desc, A, tranDescB, B_tran, bytes, tmp, C_desc, C, nullptr, UT_ARCH)); - CHECK_STATUS(matrix_matrix_multiply( - A_desc, A_ref, B_desc, B, bytes, tmp, C_desc, C_ref, nullptr, CPU_GENERAL)); + CHECK_STATUS( + matrix_matrix_multiply(A_desc, A_ref, B_desc, B, bytes, tmp, C_desc, C_ref, nullptr, CPU_GENERAL)); // check ut_check_v(C, C_ref, m * n, DT_I32, 1, __FILE__, __LINE__); @@ -68,8 +72,7 @@ int main(int argc, char **argv) // benchmark double time_start = ut_time_ms(); for (int iter = 0; iter < UT_LOOPS; iter++) { - matrix_matrix_multiply( - A_desc, A, tranDescB, B_tran, bytes, tmp, C_desc, C, nullptr, UT_ARCH); + matrix_matrix_multiply(A_desc, A, tranDescB, B_tran, bytes, tmp, C_desc, C, nullptr, UT_ARCH); } double time_end = ut_time_ms(); double time = (time_end - time_start) / UT_LOOPS; @@ -91,3 +94,21 @@ int main(int argc, char **argv) #endif return 0; } + +int main(int argc, char **argv) +{ +#ifdef COVER_TEST + int ret = 0; + for (U32 m = 1; m < 48; ++m) { + for (U32 k = 1; k < 48; ++k) { + for (U32 n = 1; n < 48; ++n) { + ret = testMMM(m, k, n); + } + } + } + return ret; +#else + CHECK_REQUIREMENT(argc == 4); + return testMMM(atoi(argv[1]), atoi(argv[2]), atoi(argv[3])); +#endif +} diff --git a/compute/blas_enhance/tests/test_mvm_int8.cpp b/compute/blas_enhance/tests/test_mvm_int8.cpp index 3165c956..1af973d4 100644 --- a/compute/blas_enhance/tests/test_mvm_int8.cpp +++ b/compute/blas_enhance/tests/test_mvm_int8.cpp @@ -47,7 +47,7 @@ int main(int argc, char **argv) INT8 *matTran = (INT8 *)ut_input_v(m * k4 + m * 4, DT_I8, UT_INIT_ZERO); INT8 *vec = (INT8 *)ut_input_v(vc, DT_I8, UT_INIT_RANDOM); INT8 *vec_ref = (INT8 *)ut_input_v(vc, DT_I8, UT_INIT_RANDOM); - memcpy(vec_ref, vec, vc); + UNI_MEMCPY(vec_ref, vec, vc); I32 *res = (I32 *)ut_input_v(rc, DT_I32, 
UT_INIT_ZERO); I32 *res_ref = (I32 *)ut_input_v(rc, DT_I32, UT_INIT_ZERO); @@ -62,7 +62,7 @@ int main(int argc, char **argv) for (U32 i = 0; i < vc; ++i) { uA[i] = (UINT8)((I32)vec[i] + 128); } - memcpy(tmp, matTran, rc * bytesOf(DT_I32)); + UNI_MEMCPY(tmp, matTran, rc * bytesOf(DT_I32)); #endif // check diff --git a/compute/image/include/image.h b/compute/image/include/image.h index 13202f80..516707c5 100644 --- a/compute/image/include/image.h +++ b/compute/image/include/image.h @@ -23,13 +23,29 @@ #include "ocl_desc_trans.h" #endif -EE resize_infer_output_size(Tensor *inputTensor, - DataType paramDT, - void *params, - Tensor *outputTensor, - U32 *outputBytes, - ArchInfo_t archInfo); +EE resize_infer_output_size( + Tensor *inputTensor, ResizeParamSpec p, Tensor *outputTensor, ArchInfo_t archInfo); + +EE resize_infer_forward_tmp_bytes( + Tensor inputTensor, ResizeParamSpec p, Tensor outputTensor, U32 *bytes, ArchInfo_t archInfo); EE resize( - Tensor inputTensor, Tensor tmpTensor, Tensor outputTensor, ResizeParamSpec p, ArchInfo_t archInfo); + Tensor inputTensor, ResizeParamSpec p, Tensor tmpTensor, Tensor outputTensor, ArchInfo_t archInfo); + +EE grid_sample_infer_output_size( + Tensor *inputTensor, Tensor *gridTensor, Tensor *outputTensor, ArchInfo_t archInfo); + +EE grid_sample_infer_forward_tmp_bytes(Tensor inputTensor, + Tensor gridTensor, + GridSampleParamSpec p, + Tensor outputTensor, + U32 *bytes, + ArchInfo_t archInfo); + +EE grid_sample(Tensor inputTensor, + Tensor gridTensor, + GridSampleParamSpec p, + Tensor tmpTensor, + Tensor outputTensor, + ArchInfo_t archInfo); #endif diff --git a/compute/image/src/CMakeLists.txt b/compute/image/src/CMakeLists.txt index 7c2da109..32d08e45 100644 --- a/compute/image/src/CMakeLists.txt +++ b/compute/image/src/CMakeLists.txt @@ -26,6 +26,9 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}) # shared library add_library(${PROJECT_NAME} SHARED ${srcs}) target_link_libraries (${PROJECT_NAME} LINK_PUBLIC uni) +if (USE_SECURE_C) + target_link_libraries(${PROJECT_NAME} LINK_PUBLIC ${SecureC_SHARED_LIBRARY}) +endif () # static library add_library(${PROJECT_NAME}_static STATIC ${srcs}) diff --git a/compute/image/src/cpu/arm/image_arm.h b/compute/image/src/cpu/arm/image_arm.h index cfbe7f19..5374fd18 100644 --- a/compute/image/src/cpu/arm/image_arm.h +++ b/compute/image/src/cpu/arm/image_arm.h @@ -16,6 +16,7 @@ #include "error.h" #include "tensor_desc.h" +#include "parameter_spec.h" -EE resize_bilinear_arm(TensorDesc inputDesc, void *input, TensorDesc outputDesc, void *output); +EE resize_bilinear_arm(TensorDesc inputDesc, void *input, ResizeParamSpec p, TensorDesc outputDesc, void *output); #endif diff --git a/compute/image/src/cpu/arm/resize_bilinear.cpp b/compute/image/src/cpu/arm/resize_bilinear.cpp index 1dbf6722..4b87c17c 100644 --- a/compute/image/src/cpu/arm/resize_bilinear.cpp +++ b/compute/image/src/cpu/arm/resize_bilinear.cpp @@ -16,7 +16,8 @@ #include "uni.h" #ifdef _USE_FP16 -EE resize_bilinear_fp16(TensorDesc inputDesc, F16 *inArray, TensorDesc outputDesc, F16 *outArray) +EE resize_bilinear_fp16( + TensorDesc inputDesc, F16 *inArray, ResizeParamSpec p, TensorDesc outputDesc, F16 *outArray) { DataType idt, odt; DataFormat idf, odf; @@ -28,8 +29,14 @@ EE resize_bilinear_fp16(TensorDesc inputDesc, F16 *inArray, TensorDesc outputDes if (idf != DF_NCHWC8 || odf != DF_NCHWC8) { CHECK_STATUS(NOT_MATCH); } - F32 strideH = ((F32)ih) / oh; - F32 strideW = ((F32)iw) / ow; + F32 strideH, strideW; + if (p.trans_mode == COORDINATE_TRANS_ALIGN_CORNERS) { 
+ strideH = ((F32)ih - 1) / (oh - 1); + strideW = ((F32)iw - 1) / (ow - 1); + } else { + strideH = ((F32)ih) / oh; + strideW = ((F32)iw) / ow; + } U32 ic_align = 8, oc_align = 8; ic /= ic_align; oc /= oc_align; @@ -78,7 +85,8 @@ EE resize_bilinear_fp16(TensorDesc inputDesc, F16 *inArray, TensorDesc outputDes #endif #ifdef _USE_FP32 -EE resize_bilinear_fp32(TensorDesc inputDesc, F32 *inArray, TensorDesc outputDesc, F32 *outArray) +EE resize_bilinear_fp32( + TensorDesc inputDesc, F32 *inArray, ResizeParamSpec p, TensorDesc outputDesc, F32 *outArray) { DataType idt, odt; DataFormat idf, odf; @@ -90,8 +98,14 @@ EE resize_bilinear_fp32(TensorDesc inputDesc, F32 *inArray, TensorDesc outputDes if (idf != DF_NCHWC8 || odf != DF_NCHWC8) { CHECK_STATUS(NOT_MATCH); } - F32 strideH = ((F32)ih) / oh; - F32 strideW = ((F32)iw) / ow; + F32 strideH, strideW; + if (p.trans_mode == COORDINATE_TRANS_ALIGN_CORNERS) { + strideH = ((F32)ih - 1) / (oh - 1); + strideW = ((F32)iw - 1) / (ow - 1); + } else { + strideH = ((F32)ih) / oh; + strideW = ((F32)iw) / ow; + } U32 ic_align = 8, oc_align = 8; ic /= ic_align; oc /= oc_align; @@ -148,18 +162,19 @@ EE resize_bilinear_fp32(TensorDesc inputDesc, F32 *inArray, TensorDesc outputDes } #endif -EE resize_bilinear_arm(TensorDesc inputDesc, void *input, TensorDesc outputDesc, void *output) +EE resize_bilinear_arm( + TensorDesc inputDesc, void *input, ResizeParamSpec p, TensorDesc outputDesc, void *output) { EE ret = SUCCESS; switch (inputDesc.dt) { #ifdef _USE_FP16 case DT_F16: - ret = resize_bilinear_fp16(inputDesc, (F16 *)input, outputDesc, (F16 *)output); + ret = resize_bilinear_fp16(inputDesc, (F16 *)input, p, outputDesc, (F16 *)output); break; #endif #ifdef _USE_FP32 case DT_F32: - ret = resize_bilinear_fp32(inputDesc, (F32 *)input, outputDesc, (F32 *)output); + ret = resize_bilinear_fp32(inputDesc, (F32 *)input, p, outputDesc, (F32 *)output); break; #endif default: diff --git a/compute/image/src/cpu/general/image_general.h b/compute/image/src/cpu/general/image_general.h index a44a19bc..cb5e98a3 100644 --- a/compute/image/src/cpu/general/image_general.h +++ b/compute/image/src/cpu/general/image_general.h @@ -15,6 +15,7 @@ #define _H_IMAGE_GENERAL #include "tensor_desc.h" +#include "parameter_spec.h" -EE resize_bilinear_general(TensorDesc inputDesc, void *input, TensorDesc outputDesc, void *output); +EE resize_bilinear_general(TensorDesc inputDesc, void *input, ResizeParamSpec p, TensorDesc outputDesc, void *output); #endif diff --git a/compute/image/src/cpu/general/resize_bilinear.cpp b/compute/image/src/cpu/general/resize_bilinear.cpp index 09705e3c..cc38e0ed 100644 --- a/compute/image/src/cpu/general/resize_bilinear.cpp +++ b/compute/image/src/cpu/general/resize_bilinear.cpp @@ -15,7 +15,8 @@ #include "uni.h" template -EE resize_bilinear(TensorDesc inputDesc, IT *inArray, TensorDesc outputDesc, OT *outArray) +EE resize_bilinear( + TensorDesc inputDesc, IT *inArray, ResizeParamSpec p, TensorDesc outputDesc, OT *outArray) { DataType idt, odt; DataFormat idf, odf; @@ -32,8 +33,14 @@ EE resize_bilinear(TensorDesc inputDesc, IT *inArray, TensorDesc outputDesc, OT oc_align = 8; } - F32 strideH = ((F32)ih) / oh; - F32 strideW = ((F32)iw) / ow; + F32 strideH, strideW; + if (p.trans_mode == COORDINATE_TRANS_ALIGN_CORNERS) { + strideH = ((F32)ih - 1) / (oh - 1); + strideW = ((F32)iw - 1) / (ow - 1); + } else { + strideH = ((F32)ih) / oh; + strideW = ((F32)iw) / ow; + } ic /= ic_align; oc /= oc_align; U32 srcTL, srcTR, srcBL, srcBR; @@ -75,6 +82,7 @@ EE 
resize_bilinear(TensorDesc inputDesc, IT *inArray, TensorDesc outputDesc, OT srcBL = ((n * ic + cc) * ih + hBB) * iw + wL; srcBR = ((n * ic + cc) * ih + hBB) * iw + wRR; } + outArray[dst] = inArray[srcTL] * factorTL + inArray[srcTR] * factorTR + inArray[srcBL] * factorBL + inArray[srcBR] * factorBR; } @@ -85,33 +93,34 @@ EE resize_bilinear(TensorDesc inputDesc, IT *inArray, TensorDesc outputDesc, OT return SUCCESS; } -EE resize_bilinear_general(TensorDesc inputDesc, void *input, TensorDesc outputDesc, void *output) +EE resize_bilinear_general( + TensorDesc inputDesc, void *input, ResizeParamSpec p, TensorDesc outputDesc, void *output) { EE ret = NOT_SUPPORTED; switch (inputDesc.dt) { #ifdef _USE_FP16 case DT_F16: { - ret = resize_bilinear(inputDesc, (F16 *)input, outputDesc, (F16 *)output); + ret = resize_bilinear(inputDesc, (F16 *)input, p, outputDesc, (F16 *)output); break; } #endif #ifdef _USE_FP32 case DT_F32: { - ret = resize_bilinear(inputDesc, (F32 *)input, outputDesc, (F32 *)output); + ret = resize_bilinear(inputDesc, (F32 *)input, p, outputDesc, (F32 *)output); break; } #endif case DT_U8: { + if (0) { #ifdef _USE_FP16 - if (DT_F16 == outputDesc.dt) { - ret = resize_bilinear(inputDesc, (U8 *)input, outputDesc, (F16 *)output); - } + } else if (DT_F16 == outputDesc.dt) { + ret = resize_bilinear(inputDesc, (U8 *)input, p, outputDesc, (F16 *)output); #endif #ifdef _USE_FP32 - if (DT_F32 == outputDesc.dt) { - ret = resize_bilinear(inputDesc, (U8 *)input, outputDesc, (F32 *)output); - } + } else if (DT_F32 == outputDesc.dt) { + ret = resize_bilinear(inputDesc, (U8 *)input, p, outputDesc, (F32 *)output); #endif + } break; } default: diff --git a/compute/image/src/cpu/grid_sample.cpp b/compute/image/src/cpu/grid_sample.cpp new file mode 100644 index 00000000..c53fabb3 --- /dev/null +++ b/compute/image/src/cpu/grid_sample.cpp @@ -0,0 +1,246 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/image_cpu.h" +#include "uni.h" + +EE grid_sample_infer_output_size_cpu( + TensorDesc inputDesc, TensorDesc gridDesc, TensorDesc *outputDesc) +{ + *outputDesc = inputDesc; + outputDesc->dims[0] = gridDesc.dims[1]; + outputDesc->dims[1] = gridDesc.dims[2]; + CHECK_REQUIREMENT(gridDesc.dims[0] == inputDesc.nDims - 2); + return SUCCESS; +} + +static inline float denormalize(float n, int length, bool align_corners) +{ + float x; + if (align_corners) { + x = (n + 1) / 2. 
* (length - 1); + } else { + x = ((n + 1) * length - 1) / 2.; + } + return x; +} + +static inline float border(float x, float x_min, float x_max) +{ + return UNI_MIN(UNI_MAX(x, x_min), x_max); +} + +static inline float reflect(float x, float x_min, float x_max) +{ + float range = x_max - x_min; + if (x < x_min) { + float dx = x_min - x; + int n = dx / range; + float r = dx - n * range; + if (n % 2 == 0) { + x = x_min + r; + } else { + x = x_max - r; + } + } else if (x > x_max) { + float dx = x - x_max; + int n = dx / range; + float r = dx - n * range; + if (n % 2 == 0) { + x = x_max - r; + } else { + x = x_min + r; + } + } + return x; +} + +template +static inline float get( + const T *image, int it, int ih, int iw, int t, int h, int w, int cAlign, PadMode mode, float *bound) +{ + float pixel; + if (mode == PAD_CONSTANT) { + if (t >= 0 && t < it && h >= 0 && h < ih && w >= 0 && w < iw) { + pixel = image[(((t * ih) + h) * iw + w) * cAlign]; + } else { + pixel = 0; + } + } else if (mode == PAD_EDGE) { + w = border(w, 0, iw - 1); + h = border(h, 0, ih - 1); + //t = border(t, 0, it - 1); + pixel = image[(((t * ih) + h) * iw + w) * cAlign]; + } else { + w = reflect(w, bound[0], bound[1]); + h = reflect(h, bound[2], bound[3]); + //t = reflect(t, bound[4], bound[5]); + pixel = image[(((t * ih) + h) * iw + w) * cAlign]; + } + return pixel; +} + +template +static EE grid_sample_kernel(TensorDesc inputDesc, + T *input, + TensorDesc gridDesc, + T *grid, + GridSampleParamSpec p, + T *tmp, + TensorDesc outputDesc, + T *output) +{ + DataType idt; + DataFormat idf; + U32 in, ic, it, ih, iw; + if (tensorIs3d(inputDesc)) { + CHECK_STATUS(tensor3dGet(inputDesc, &idt, &idf, &in, &ic, &iw)); + it = ih = 1; + } else if (tensorIs4d(inputDesc)) { + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + it = 1; + } else if (tensorIs5d(inputDesc)) { + CHECK_STATUS(tensor5dGet(inputDesc, &idt, &idf, &in, &ic, &it, &ih, &iw)); + } else { + return NOT_SUPPORTED; + } + int olen = tensorNumElements(outputDesc) / in / ic; + int S = tensorNumElements(gridDesc) / in / olen; + int cAlign = 1; + if (idf == DF_NCHWC8) { + cAlign = 8; + } + ic /= cAlign; + + float w_min = -0.5; + float w_max = iw - 0.5; + float h_min = -0.5; + float h_max = ih - 0.5; + float t_min = -0.5; + float t_max = it - 0.5; + if (p.align_corners) { + w_min = -0.5; + w_max = iw - 0.5; + h_min = -0.5; + h_max = ih - 0.5; + t_min = -0.5; + t_max = it - 0.5; + } + float bound[6] = {w_min, w_max, h_min, h_max, t_min, t_max}; + EE ret = SUCCESS; +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 o = 0; o < in * ic; o++) { + U32 n = o / ic; + U32 c = o % ic; + float x, y, z; + for (int i = 0; i < olen; i++) { + T *g = grid + (n * olen + i) * S; + for (int c8 = 0; c8 < cAlign; c8++) { + T *data = input + o * it * ih * iw * cAlign + c8; + T *out = output + (o * olen + i) * cAlign + c8; + x = denormalize(g[0], iw, p.align_corners); + if (S > 1) { + y = denormalize(g[1], ih, p.align_corners); + } else { + y = 0; + } + if (S > 2) { + z = denormalize(g[2], it, p.align_corners); + } else { + z = 0; + } + //switch (p.pad_mode) { + // case PAD_EDGE: { + // x = border(x, 0, iw - 1); + // y = border(y, 0, ih - 1); + // z = border(z, 0, it - 1); + // break; + // } + // case PAD_REFLECT: { + // x = reflect(x, w_min, w_max); + // y = reflect(y, h_min, h_max); + // z = reflect(z, t_min, t_max); + // break; + // } + // default: + // break; + //} + switch (p.mode) { + case RESIZE_NEAREST: { + x = round(x); + y = 
round(y); + z = round(z); + *out = get(data, it, ih, iw, z, y, x, cAlign, p.pad_mode, bound); + break; + } + case RESIZE_LINEAR: { + int x1 = floor(x); + int x2 = x1 + 1; + int y1 = floor(y); + int y2 = y1 + 1; + //int z1 = floor(z); + //int z2 = z1 + 1; + float p11 = get(data, it, ih, iw, 0, y1, x1, cAlign, p.pad_mode, bound); + float p12 = get(data, it, ih, iw, 0, y1, x2, cAlign, p.pad_mode, bound); + float p21 = get(data, it, ih, iw, 0, y2, x1, cAlign, p.pad_mode, bound); + float p22 = get(data, it, ih, iw, 0, y2, x2, cAlign, p.pad_mode, bound); + float dx2 = x2 - x; + float dx1 = x - x1; + float dy2 = y2 - y; + float dy1 = y - y1; + *out = dy2 * (dx2 * p11 + dx1 * p12) + dy1 * (dx2 * p21 + dx1 * p22); + break; + } + default: + UNI_ERROR_LOG("GridSample currently not support this mode.\n"); + ret = NOT_SUPPORTED; + break; + } + } + } + } + return ret; +} + +EE grid_sample_cpu(TensorDesc inputDesc, + void *input, + TensorDesc gridDesc, + void *grid, + GridSampleParamSpec p, + void *tmp, + TensorDesc outputDesc, + void *output) +{ + EE ret = NOT_SUPPORTED; + switch (inputDesc.dt) { +#ifdef _USE_FP16 + case DT_F16: { + ret = grid_sample_kernel(inputDesc, (F16 *)input, gridDesc, (F16 *)grid, p, + (F16 *)tmp, outputDesc, (F16 *)output); + break; + } +#endif +#ifdef _USE_FP32 + case DT_F32: { + ret = grid_sample_kernel(inputDesc, (F32 *)input, gridDesc, (F32 *)grid, p, + (F32 *)tmp, outputDesc, (F32 *)output); + break; + } +#endif + default: + break; + } + return ret; +} diff --git a/compute/image/src/cpu/image_cpu.h b/compute/image/src/cpu/image_cpu.h index 61030f4d..5049c101 100644 --- a/compute/image/src/cpu/image_cpu.h +++ b/compute/image/src/cpu/image_cpu.h @@ -19,4 +19,16 @@ EE resize_nearest_cpu( TensorDesc inputDesc, void *input, ResizeParamSpec p, TensorDesc outputDesc, void *output); + +EE grid_sample_infer_output_size_cpu( + TensorDesc inputDesc, TensorDesc gridDesc, TensorDesc *outputDesc); + +EE grid_sample_cpu(TensorDesc inputDesc, + void *input, + TensorDesc gridDesc, + void *grid, + GridSampleParamSpec p, + void *tmp, + TensorDesc outputDesc, + void *output); #endif diff --git a/compute/image/src/cpu/resize_nearest.cpp b/compute/image/src/cpu/resize_nearest.cpp index 0601e0e5..b4c1c776 100644 --- a/compute/image/src/cpu/resize_nearest.cpp +++ b/compute/image/src/cpu/resize_nearest.cpp @@ -12,8 +12,36 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
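Side note on the new grid_sample kernel: its denormalize() helper maps a normalized grid coordinate in [-1, 1] onto pixel space, and the two branches differ in where the endpoints land. A standalone restatement with a small numeric check (assumed equivalent to the helper in the patch, not taken from it verbatim):

#include <cstdio>

// Maps a normalized coordinate n in [-1, 1] onto an axis of `length` pixels.
static float denormalize_sketch(float n, int length, bool align_corners)
{
    return align_corners ? (n + 1) / 2.0f * (length - 1)    // endpoints hit pixel centers 0 and length-1
                         : ((n + 1) * length - 1) / 2.0f;   // endpoints sit half a pixel outside the border
}

int main()
{
    // length = 4: align_corners maps -1/0/+1 -> 0.0/1.5/3.0,
    // while the default mapping gives -0.5/1.5/3.5 instead.
    printf("%.1f %.1f %.1f\n", denormalize_sketch(-1, 4, true),
        denormalize_sketch(0, 4, true), denormalize_sketch(1, 4, true));
    printf("%.1f %.1f %.1f\n", denormalize_sketch(-1, 4, false),
        denormalize_sketch(0, 4, false), denormalize_sketch(1, 4, false));
    return 0;
}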
#include "cpu/image_cpu.h" +#include "affinity_policy.h" -template +template +inline static int round_d(float x) +{ + int ret = 0; + switch (round_mode) { + case ROUND_FLOOR: + ret = floor(x); + break; + case ROUND_CEIL: + ret = ceil(x); + break; + case ROUND_PREFER_FLOOR: + ret = round(x); + if (ret - x == 0.5) { + ret -= 1; + } + break; + case ROUND_PREFER_CEIL: + ret = round(x); + break; + default: + UNI_ERROR_LOG("Resize currently not support this round mode.\n"); + break; + } + return ret; +} + +template inline static EE resize_nearest_kernel( const TensorDesc &inputDesc, IT *inArray, const TensorDesc &outputDesc, OT *outArray) { @@ -38,61 +66,64 @@ inline static EE resize_nearest_kernel( float ws0 = iw * 1.0 / ow; float hs1 = (ih - 1.0) / (oh - 1.0); float ws1 = (iw - 1.0) / (ow - 1.0); - - int srcX, srcY, src; - for (U32 n = 0, dst = 0; n < on; n++) { - for (I32 c = 0; c < oc_d; c++) { - for (U32 h = 0; h < oh; h++) { - for (U32 w = 0; w < ow; w++) { - for (int k = 0; k < oc_align; k++, dst++) { - switch (coordinate_transformation_mode) { - case HALF_PIXEL: { - srcX = (h + 0.5) * hs0 - 0.5; - srcY = (w + 0.5) * ws0 - 0.5; - if (srcX < 0) { - srcX = 0; - } - if (srcY < 0) { - srcY = 0; - } - break; +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 o = 0; o < on * oc_d; o++) { + int n = o / oc_d; + int c = o % oc_d; + int dst = o * oh * ow * oc_align; + int srcX, srcY, src; + for (U32 h = 0; h < oh; h++) { + for (U32 w = 0; w < ow; w++) { + for (int k = 0; k < oc_align; k++, dst++) { + switch (coordinate_transformation_mode) { + case COORDINATE_TRANS_HALF_PIXEL: { + srcX = round_d((h + 0.5) * hs0 - 0.5); + srcY = round_d((w + 0.5) * ws0 - 0.5); + if (srcX < 0) { + srcX = 0; } - case PYTORCH_HALF_PIXEL: { - srcX = oh > 1 ? (h + 0.5) * hs0 - 0.5 : 0; - srcY = ow > 1 ? (w + 0.5) * ws0 - 0.5 : 0; - if (srcX < 0) { - srcX = 0; - } - if (srcY < 0) { - srcY = 0; - } - break; + if (srcY < 0) { + srcY = 0; } - case ALIGN_CORNERS: { - srcX = h * hs1; - srcY = w * ws1; - break; + break; + } + case COORDINATE_TRANS_PYTORCH_HALF_PIXEL: { + srcX = oh > 1 ? round_d((h + 0.5) * hs0 - 0.5) : 0; + srcY = ow > 1 ? 
round_d((w + 0.5) * ws0 - 0.5) : 0; + if (srcX < 0) { + srcX = 0; } - case ASYMMETRIC: { - srcX = h * hs0; - srcY = w * ws0; - break; + if (srcY < 0) { + srcY = 0; } - default: - UNI_ERROR_LOG("Resize currently not support this coordinate " - "transformation mode.\n"); - break; + break; + } + case COORDINATE_TRANS_ALIGN_CORNERS: { + srcX = round_d(h * hs1); + srcY = round_d(w * ws1); + break; } - U32 cc = c * oc_align + k; - if (idf == DF_NCHWC8) { - U32 cc1 = cc / ic_align; - U32 cc2 = cc % ic_align; - src = (((n * ic_d + cc1) * ih + srcX) * iw + srcY) * ic_align + cc2; - } else { - src = ((n * ic + cc) * ih + srcX) * iw + srcY; + case COORDINATE_TRANS_ASYMMETRIC: { + srcX = round_d(h * hs0); + srcY = round_d(w * ws0); + break; } - outArray[dst] = (OT)inArray[src]; + default: + UNI_ERROR_LOG("Resize currently not support this coordinate " + "transformation mode.\n"); + break; } + U32 cc = c * oc_align + k; + if (idf == DF_NCHWC8) { + U32 cc1 = cc / ic_align; + U32 cc2 = cc % ic_align; + src = (((n * ic_d + cc1) * ih + srcX) * iw + srcY) * ic_align + cc2; + } else { + src = ((n * ic + cc) * ih + srcX) * iw + srcY; + } + outArray[dst] = (OT)inArray[src]; } } } @@ -100,6 +131,43 @@ inline static EE resize_nearest_kernel( return SUCCESS; } +template +inline static EE resize_nearest_kernel(const TensorDesc &inputDesc, + IT *inArray, + ResizeParamSpec p, + const TensorDesc &outputDesc, + OT *outArray) +{ + EE ret = SUCCESS; + switch (p.round_mode) { + case ROUND_CEIL: { + resize_nearest_kernel( + inputDesc, inArray, outputDesc, outArray); + break; + } + case ROUND_FLOOR: { + resize_nearest_kernel( + inputDesc, inArray, outputDesc, outArray); + break; + } + case ROUND_PREFER_CEIL: { + resize_nearest_kernel( + inputDesc, inArray, outputDesc, outArray); + break; + } + case ROUND_PREFER_FLOOR: { + resize_nearest_kernel( + inputDesc, inArray, outputDesc, outArray); + break; + } + default: + UNI_ERROR_LOG("Resize currently not support this round mode.\n"); + ret = NOT_SUPPORTED; + break; + } + return ret; +} + template inline static EE resize_nearest_wrapper(const TensorDesc &inputDesc, IT *inArray, @@ -109,21 +177,24 @@ inline static EE resize_nearest_wrapper(const TensorDesc &inputDesc, { EE ret = SUCCESS; switch (p.trans_mode) { - case HALF_PIXEL: { - resize_nearest_kernel(inputDesc, inArray, outputDesc, outArray); + case COORDINATE_TRANS_HALF_PIXEL: { + resize_nearest_kernel( + inputDesc, inArray, p, outputDesc, outArray); break; } - case PYTORCH_HALF_PIXEL: { - resize_nearest_kernel( - inputDesc, inArray, outputDesc, outArray); + case COORDINATE_TRANS_PYTORCH_HALF_PIXEL: { + resize_nearest_kernel( + inputDesc, inArray, p, outputDesc, outArray); break; } - case ALIGN_CORNERS: { - resize_nearest_kernel(inputDesc, inArray, outputDesc, outArray); + case COORDINATE_TRANS_ALIGN_CORNERS: { + resize_nearest_kernel( + inputDesc, inArray, p, outputDesc, outArray); break; } - case ASYMMETRIC: { - resize_nearest_kernel(inputDesc, inArray, outputDesc, outArray); + case COORDINATE_TRANS_ASYMMETRIC: { + resize_nearest_kernel( + inputDesc, inArray, p, outputDesc, outArray); break; } default: diff --git a/compute/image/src/cpu/x86/image_x86.h b/compute/image/src/cpu/x86/image_x86.h index b6396d35..0fcefe7f 100644 --- a/compute/image/src/cpu/x86/image_x86.h +++ b/compute/image/src/cpu/x86/image_x86.h @@ -20,8 +20,8 @@ EE resize_bilinear_x86(TensorDesc inputDesc, void *input, - TensorDesc outputDesc, + ResizeParamSpec p, void *tmp, - void *output, - ResizeParamSpec p); + TensorDesc outputDesc, + void *output); 
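The resize changes above rename the coordinate transformation enum values to COORDINATE_TRANS_* and route the fractional result through round_d(); for readability, the four source-index mappings used by infer_src and the nearest/bilinear kernels can be gathered into one helper (a sketch only; CoordinateTransMode is assumed to come from parameter_spec.h as in the patch):

// dst index x on an axis of out_len pixels -> fractional src coordinate on in_len pixels.
static float infer_src_sketch(int x, int in_len, int out_len, CoordinateTransMode mode)
{
    float scale = 1.0f * in_len / out_len;
    switch (mode) {
        case COORDINATE_TRANS_HALF_PIXEL:
            return (x + 0.5f) * scale - 0.5f;                      // pixel centers aligned
        case COORDINATE_TRANS_PYTORCH_HALF_PIXEL:
            return (out_len > 1) ? (x + 0.5f) * scale - 0.5f : 0;  // degenerate 1-pixel output maps to 0
        case COORDINATE_TRANS_ALIGN_CORNERS:
            return x * 1.0f * (in_len - 1) / (out_len - 1);        // first/last samples coincide
        case COORDINATE_TRANS_ASYMMETRIC:
        default:
            return x * scale;                                      // plain scaling from the origin
    }
}
// Callers clamp the result to [0, in_len - 1] before indexing, as the kernels above do.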
#endif diff --git a/compute/image/src/cpu/x86/resize_bilinear.cpp b/compute/image/src/cpu/x86/resize_bilinear.cpp index 43e2c2ba..0a391cfe 100644 --- a/compute/image/src/cpu/x86/resize_bilinear.cpp +++ b/compute/image/src/cpu/x86/resize_bilinear.cpp @@ -46,18 +46,22 @@ typedef void (*compute_bilinear_func)(F32 *input0, U32 onStep, U32 on); -inline F32 infer_src(I32 x, I32 iw, I32 ow, ResizeCoordinateTransMode trans_mode) +inline F32 infer_src(I32 x, I32 iw, I32 ow, CoordinateTransMode trans_mode) { + F32 scale = 1.0 * iw / ow; F32 ret; switch (trans_mode) { - case HALF_PIXEL: - ret = (x + 0.5f) * 1.0f * iw / ow - 0.5; + case COORDINATE_TRANS_HALF_PIXEL: + ret = (x + 0.5f) * scale - 0.5; break; - case ALIGN_CORNERS: + case COORDINATE_TRANS_ALIGN_CORNERS: ret = x * 1.0f * (iw - 1) / (ow - 1); break; - case PYTORCH_HALF_PIXEL: - ret = (ow > 1) ? ((x + 0.5f) * 1.0f * iw / ow - 0.5) : 0; + case COORDINATE_TRANS_PYTORCH_HALF_PIXEL: + ret = (ow > 1) ? ((x + 0.5f) * scale - 0.5) : 0; + break; + case COORDINATE_TRANS_ASYMMETRIC: + ret = x * scale; break; default: ret = 0; @@ -281,8 +285,59 @@ inline void compute_bilinear_nchw_fp32(F32 *input0, } } +EE resize_bilinear_x86_fp32_nchw( + TensorDesc inputDesc, F32 *input, ResizeParamSpec p, F32 *tmp, TensorDesc outputDesc, F32 *output) +{ + DataType idt, odt; + DataFormat idf, odf; + U32 in, ic, ih, iw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + CHECK_REQUIREMENT(odf == DF_NCHW || idf == DF_NCHW); + + for (U32 c = 0; c < oc; ++c) { + F32 *outp = output + c * oh * ow; + F32 *inp = input + c * ih * iw; + + for (U32 h = 0; h < oh; ++h) { + F32 hC = infer_src(h, ih, oh, p.trans_mode); + hC = UNI_MIN(ih - 1, UNI_MAX(0, hC)); + I32 hT = floor(hC); + I32 hB = ceil(hC); + F32 h1 = hB - hC; + F32 h2 = hC - hT; + + for (U32 w = 0; w < ow; ++w) { + F32 wC = infer_src(w, iw, ow, p.trans_mode); + wC = UNI_MIN(iw - 1, UNI_MAX(0, wC)); + I32 wL = floor(wC); + I32 wR = ceil(wC); + F32 w1 = wR - wC; + F32 w2 = wC - wL; + + U32 output_idx = h * ow + w; + if (hB == hT && wL == wR) { + outp[output_idx] = inp[hT * iw + wL]; + } else if (hB == hT) { + outp[output_idx] = w1 * inp[hT * iw + wL] + w2 * inp[hT * iw + wR]; + } else if (wL == wR) { + outp[output_idx] = h1 * inp[hT * iw + wL] + h2 * inp[hB * iw + wL]; + } else { + outp[output_idx] = h1 * w1 * inp[hT * iw + wL] + + h1 * w2 * inp[hT * iw + wR] + + h2 * w1 * inp[hB * iw + wL] + + h2 * w2 * inp[hB * iw + wR]; + } + + } + } + } + return SUCCESS; +} + EE resize_bilinear_x86_fp32( - TensorDesc inputDesc, F32 *input, TensorDesc outputDesc, F32 *tmp, F32 *output, ResizeParamSpec p) + TensorDesc inputDesc, F32 *input, ResizeParamSpec p, F32 *tmp, TensorDesc outputDesc, F32 *output) { DataType idt, odt; DataFormat idf, odf; @@ -291,6 +346,7 @@ EE resize_bilinear_x86_fp32( CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); CHECK_REQUIREMENT(idf == DF_NCHWC8 || idf == DF_NCHW); + EE ret = SUCCESS; U32 ocStep = oh * ow * 8; @@ -316,22 +372,6 @@ EE resize_bilinear_x86_fp32( F32 hC = infer_src(h, ih, oh, p.trans_mode); F32 wC = infer_src(w, iw, ow, p.trans_mode); U32 output_idx = h * ow * 8 + w * 8; - if (h == 0 && w == 0) { - copy[func_idx](input, output, icStep, ocStep, ic, inStep, onStep, on); - continue; - } else if (h == oh - 1 && w == ow - 1) { - copy[func_idx](input + ((ih - 1) * iw + iw - 1) * itile_size, 
output + output_idx, - icStep, ocStep, ic, inStep, onStep, on); - continue; - } else if (h == 0 && w == ow - 1) { - copy[func_idx](input + (iw - 1) * itile_size, output + output_idx, icStep, ocStep, - ic, inStep, onStep, on); - continue; - } else if (h == oh - 1 && w == 0) { - copy[func_idx](input + (ih - 1) * iw * itile_size, output + output_idx, icStep, - ocStep, ic, inStep, onStep, on); - continue; - } // process edge pixel, linear hC = UNI_MIN(ih - 1, UNI_MAX(0, hC)); @@ -390,17 +430,10 @@ EE resize_bilinear_x86_fp32( } } I32 mainc = c; - for (; c < (I32)oc - 3; c += 4) { - for (I32 hw = 0; hw < ohow; ++hw) { - outArray[n * oc * ohow + c * ohow + hw] = - output[n * oc * ohow + mainc * ohow + hw * 4 + (c - mainc)]; - } - } - mainc = c; for (; c < (I32)oc; ++c) { for (I32 hw = 0; hw < ohow; ++hw) { outArray[n * oc * ohow + c * ohow + hw] = - output[n * oc * ohow + mainc * ohow + hw * ((I32)oc - mainc) + (c - mainc)]; + output[n * oc * ohow + mainc * ohow + hw * 8 + (c - mainc)]; } } } @@ -411,10 +444,10 @@ EE resize_bilinear_x86_fp32( EE resize_bilinear_x86(TensorDesc inputDesc, void *input, - TensorDesc outputDesc, + ResizeParamSpec p, void *tmp, - void *output, - ResizeParamSpec p) + TensorDesc outputDesc, + void *output) { DataType idt, odt; DataFormat idf, odf; @@ -425,8 +458,13 @@ EE resize_bilinear_x86(TensorDesc inputDesc, EE ret = NOT_SUPPORTED; switch (idt) { case DT_F32: - ret = resize_bilinear_x86_fp32( - inputDesc, (F32 *)input, outputDesc, (F32 *)tmp, (F32 *)output, p); + if (idf == DF_NCHW && odf == DF_NCHW) { + ret = resize_bilinear_x86_fp32_nchw( + inputDesc, (F32 *)input, p, (F32 *)tmp, outputDesc, (F32 *)output); + } else { + ret = resize_bilinear_x86_fp32( + inputDesc, (F32 *)input, p, (F32 *)tmp, outputDesc, (F32 *)output); + } default: break; } diff --git a/compute/image/src/gpu/mali/cl/kernel_option/resize_opt.h b/compute/image/src/gpu/mali/cl/kernel_option/resize_opt.h index 801ddc61..f3c14b3f 100644 --- a/compute/image/src/gpu/mali/cl/kernel_option/resize_opt.h +++ b/compute/image/src/gpu/mali/cl/kernel_option/resize_opt.h @@ -15,22 +15,22 @@ inline EE set_resize_nearest_opt_mali(ResizeParamSpec p, CHECK_STATUS(set_io_mem_name(inputMemType, outputMemType, ioMemName)); std::string modeName = ""; switch (p.trans_mode) { - case HALF_PIXEL: { + case COORDINATE_TRANS_HALF_PIXEL: { modeName = "_half_pixel"; CHECK_STATUS(set_chars_define_opt("USE_HALF_PIXEL", opt)); break; } - case PYTORCH_HALF_PIXEL: { + case COORDINATE_TRANS_PYTORCH_HALF_PIXEL: { modeName = "_pytorch_half_pixel"; CHECK_STATUS(set_chars_define_opt("USE_PYTORCH_HALF_PIXEL", opt)); break; } - case ALIGN_CORNERS: { + case COORDINATE_TRANS_ALIGN_CORNERS: { modeName = "_align_corners"; CHECK_STATUS(set_chars_define_opt("USE_ALIGN_CORNERS", opt)); break; } - case ASYMMETRIC: { + case COORDINATE_TRANS_ASYMMETRIC: { modeName = "_asymmetric"; CHECK_STATUS(set_chars_define_opt("USE_ASYMMETRIC", opt)); break; @@ -42,8 +42,9 @@ inline EE set_resize_nearest_opt_mali(ResizeParamSpec p, if (useNchwFormat) { formatName = "nchw"; } - sprintf(kernelName, "resize_nearest_%s%s%s", ioMemName, formatName.c_str(), modeName.c_str()); - sprintf(kernelOpt->sourceName, "resize_nearest"); + std::string kernel = std::string("resize_nearest_") + ioMemName + formatName + modeName; + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "resize_nearest"); if (useNchwFormat) { CHECK_STATUS(set_chars_define_opt("USE_NCHW", opt)); } @@ -66,8 +67,9 @@ inline EE set_resize_bilinear_opt_mali(bool useNchwFormat, if 
(useNchwFormat) { formatName = "nchw"; } - sprintf(kernelName, "resize_bilinear_%s%s", ioMemName, formatName.c_str()); - sprintf(kernelOpt->sourceName, "resize_bilinear"); + std::string kernel = std::string("resize_bilinear_") + ioMemName + formatName; + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "resize_bilinear"); if (useNchwFormat) { CHECK_STATUS(set_chars_define_opt("USE_NCHW", opt)); } diff --git a/compute/image/src/gpu/mali/fp16/resize_mali_fp16.cpp b/compute/image/src/gpu/mali/fp16/resize_mali_fp16.cpp index 9ba37323..91301879 100644 --- a/compute/image/src/gpu/mali/fp16/resize_mali_fp16.cpp +++ b/compute/image/src/gpu/mali/fp16/resize_mali_fp16.cpp @@ -97,7 +97,7 @@ inline EE resize_nearest_core_mali_fp16(GCLHandle_t handle, GCLMemType outputMemType = output->desc.memType; F32 ratiow, ratioh; - if (p.trans_mode == ALIGN_CORNERS) { + if (p.trans_mode == COORDINATE_TRANS_ALIGN_CORNERS) { ratiow = (iw - 1.0) / (ow - 1.0); ratioh = (ih - 1.0) / (oh - 1.0); } else { diff --git a/compute/image/src/grid_sample.cpp b/compute/image/src/grid_sample.cpp new file mode 100644 index 00000000..36a49754 --- /dev/null +++ b/compute/image/src/grid_sample.cpp @@ -0,0 +1,65 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
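The new NCHW bilinear path in resize_bilinear_x86_fp32_nchw and the RESIZE_LINEAR branch of grid_sample both reduce to the same 2x2 blend: each corner is weighted by its distance to the opposite corner. A pure-function sketch over a single-channel row-major image (weights assumed equivalent to the patch's h1/h2/w1/w2 form):

#include <cmath>

// hC/wC are assumed already clamped to [0, ih - 1] / [0, iw - 1], as in the patch.
static float bilinear_sketch(const float *img, int ih, int iw, float hC, float wC)
{
    int hT = (int)floorf(hC), wL = (int)floorf(wC);  // top-left corner
    int hB = (hT + 1 < ih) ? hT + 1 : hT;            // clamp the bottom row at the border
    int wR = (wL + 1 < iw) ? wL + 1 : wL;            // clamp the right column at the border
    float h2 = hC - hT, h1 = 1.0f - h2;              // vertical weights
    float w2 = wC - wL, w1 = 1.0f - w2;              // horizontal weights
    return h1 * (w1 * img[hT * iw + wL] + w2 * img[hT * iw + wR]) +
           h2 * (w1 * img[hB * iw + wL] + w2 * img[hB * iw + wR]);
}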
+ +#include "image.h" +#ifdef _USE_CPU +#include "cpu/image_cpu.h" +#endif + +EE grid_sample_infer_output_size( + Tensor *inputTensor, Tensor *gridTensor, Tensor *outputTensor, ArchInfo_t archInfo) +{ + TensorDesc inputDesc = inputTensor->get_desc(); + TensorDesc gridDesc = gridTensor->get_desc(); + TensorDesc outputDesc = outputTensor->get_desc(); + auto arch = archInfo->arch; + EE ret = NOT_SUPPORTED; + if (IS_CPU(arch)) { + ret = grid_sample_infer_output_size_cpu(inputDesc, gridDesc, &outputDesc); + } + outputTensor->resize(outputDesc); + return ret; +} + +EE grid_sample_infer_forward_tmp_bytes(Tensor inputTensor, + Tensor gridTensor, + GridSampleParamSpec p, + Tensor outputTensor, + U32 *bytes, + ArchInfo_t archInfo) +{ + *bytes = 0; + return SUCCESS; +} + +EE grid_sample(Tensor inputTensor, + Tensor gridTensor, + GridSampleParamSpec p, + Tensor tmpTensor, + Tensor outputTensor, + ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + TensorDesc gridDesc = gridTensor.get_desc(); + TensorDesc outputDesc = outputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + void *grid = get_ptr_from_tensor(gridTensor, arch); + void *tmp = get_ptr_from_tensor(tmpTensor, arch); + void *output = get_ptr_from_tensor(outputTensor, arch); + EE ret = NOT_SUPPORTED; + if (IS_CPU(arch)) { + ret = grid_sample_cpu(inputDesc, input, gridDesc, grid, p, tmp, outputDesc, output); + } + return ret; +} diff --git a/compute/image/src/image_processing.cpp b/compute/image/src/image_processing.cpp index 3b5a754a..6b1c6fb1 100644 --- a/compute/image/src/image_processing.cpp +++ b/compute/image/src/image_processing.cpp @@ -11,6 +11,7 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
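The new grid_sample front end above follows the library's usual infer-size / infer-tmp-bytes / compute sequence. A rough CPU-side usage sketch (tensor shapes and the GridSampleParamSpec contents are illustrative placeholders, not taken from this patch):

    ArchInfo archInfo;
    archInfo.arch = CPU_GENERAL;
    Tensor inputTensor = Tensor::alloc_sized(tensor4df(DT_F32, DF_NCHW, 1, 3, 32, 32));
    Tensor gridTensor = Tensor::alloc_sized(tensor4df(DT_F32, DF_NCHW, 1, 16, 16, 2));
    Tensor outputTensor;
    GridSampleParamSpec p;  // field values are assumptions; set mode/padding as your model requires
    CHECK_STATUS(grid_sample_infer_output_size(&inputTensor, &gridTensor, &outputTensor, &archInfo));
    outputTensor.alloc();
    U32 tmpBytes = 0;
    CHECK_STATUS(grid_sample_infer_forward_tmp_bytes(inputTensor, gridTensor, p, outputTensor, &tmpBytes, &archInfo));
    Tensor tmpTensor = Tensor::alloc_sized(tensor1d(DT_I8, tmpBytes));
    CHECK_STATUS(grid_sample(inputTensor, gridTensor, p, tmpTensor, outputTensor, &archInfo));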
+#include #include "image.h" template @@ -87,7 +88,10 @@ std::shared_ptr get_resize_image( } ResizeParamSpec p; - p.mode = LINEAR; + p.mode = RESIZE_LINEAR; + p.trans_mode = COORDINATE_TRANS_ASYMMETRIC; + p.round_mode = ROUND_FLOOR; + // consider the dataformat if (targetImageFormat == RGB_SC) { // Specific for Birealnet18, scale short edge to 224 first F32 scale = 224.0 / UNI_MIN(height, width); @@ -98,11 +102,9 @@ std::shared_ptr get_resize_image( height = (U32)(scale * height + 0.5); width = 224; } - Tensor scaleTensor; TensorDesc scaledDesc = tensor4df(imageDt, imageDf, imageNum, imageChannel, height, width); - scaleTensor.resize(scaledDesc); - scaleTensor.alloc(); - resize(rgbTensor, temp, scaleTensor, p, &archInfo); + Tensor scaleTensor = Tensor::alloc_sized(scaledDesc); + resize(rgbTensor, p, temp, scaleTensor, &archInfo); U32 h0 = (U32)((height - 224) * 0.5); U32 w0 = (U32)((width - 224) * 0.5); @@ -113,14 +115,14 @@ std::shared_ptr get_resize_image( for (U32 w = w0; w < w0 + imageWidth; w++) { T value = (scaled[c * height * width + h * width + w] / 255 - meanRGBSC[c]) / stdRGBSC[c]; - CHECK_REQUIREMENT(!UNI_ISNAN(value)); + CHECK_REQUIREMENT(!isnan((float)value)); *transferSpacePtrMov = value; transferSpacePtrMov++; } } } } else if (targetImageFormat == RGB_RAW) { - resize(rgbTensor, temp, *transferSpaceTensor.get(), p, &archInfo); + resize(rgbTensor, p, temp, *transferSpaceTensor.get(), &archInfo); } else if (targetImageFormat == RGB_SC_RAW || targetImageFormat == BGR_SC_RAW) { F32 scale = 256.0 / UNI_MIN(height, width); if (height < width) { @@ -130,11 +132,9 @@ std::shared_ptr get_resize_image( height = (U32)(scale * (F32)height + 0.5); width = 256; } - Tensor scaleTensor; TensorDesc scaledDesc = tensor4df(imageDt, imageDf, imageNum, imageChannel, height, width); - scaleTensor.resize(scaledDesc); - scaleTensor.alloc(); - resize(rgbTensor, temp, scaleTensor, p, &archInfo); + Tensor scaleTensor = Tensor::alloc_sized(scaledDesc); + resize(rgbTensor, p, temp, scaleTensor, &archInfo); U32 h0 = (U32)((height - 224) * 0.5); U32 w0 = (U32)((width - 224) * 0.5); @@ -142,16 +142,14 @@ std::shared_ptr get_resize_image( T *scaled = (T *)get_ptr_from_tensor(scaleTensor, arch); for (U32 c : transform) { for (U32 h = h0; h < h0 + 224; h++) { - memcpy(transferSpacePtrMov, scaled + c * height * width + h * width + w0, + UNI_MEMCPY(transferSpacePtrMov, scaled + c * height * width + h * width + w0, 224 * bytesOf(imageDt)); transferSpacePtrMov += 224; } } } else { - Tensor scaleTensor; - scaleTensor.resize(imageDesc); - scaleTensor.alloc(); - resize(rgbTensor, temp, scaleTensor, p, &archInfo); + Tensor scaleTensor = Tensor::alloc_sized(imageDesc); + resize(rgbTensor, p, temp, scaleTensor, &archInfo); T *resized = (T *)get_ptr_from_tensor(scaleTensor, arch); for (U32 c : transform) { @@ -160,7 +158,7 @@ std::shared_ptr get_resize_image( T value = (resized[c * imageHeight * imageWidth + h * imageWidth + w] - 1.0 * meanRGB[c]) * scaleValue; - CHECK_REQUIREMENT(!UNI_ISNAN(value)); + CHECK_REQUIREMENT(!isnan((float)value)); *transferSpacePtrMov = value; transferSpacePtrMov++; } diff --git a/compute/image/src/resize.cpp b/compute/image/src/resize.cpp index 09591837..0589a895 100644 --- a/compute/image/src/resize.cpp +++ b/compute/image/src/resize.cpp @@ -27,38 +27,32 @@ #ifdef _USE_X86 #include "cpu/x86/image_x86.h" #endif -#include // params is a pointer to either the target size or the resize ratios // When paramDT specifies DT_U32, params should point to target sizes (height and width) // When 
paramDT specifies DT_F32, params should point to resize ratios -EE resize_infer_output_size_cpu( - TensorDesc inputDesc, DataType paramDT, void *params, TensorDesc *outputDesc, U32 *outputBytes) +EE resize_infer_output_size_cpu(TensorDesc inputDesc, ResizeParamSpec p, TensorDesc *outputDesc) { - if (nullptr == outputDesc || nullptr == outputBytes) { - CHECK_STATUS(NULL_POINTER); - } DataType idt; DataFormat idf, odf; - U32 in, ic, ih, iw; - U32 oh, ow; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - - switch (paramDT) { - case DT_F32: { - F32 *scales = (F32 *)params; - oh = ih * scales[0]; - ow = iw * scales[1]; - break; - } - case DT_U32: { - U32 *len = (U32 *)params; - oh = len[0]; - ow = len[1]; - break; + U32 in, ic, ih, iw = 1; + U32 oh, ow = 1; + if (tensorIs3d(inputDesc)) { + CHECK_STATUS(tensor3dGet(inputDesc, &idt, &idf, &in, &ic, &ih)); + } else if (tensorIs4d(inputDesc)) { + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + } else { + UNI_ERROR_LOG("can support to resize %d-dim tensor.\n", inputDesc.nDims); + } + if (p.num_sizes > 0) { + oh = p.sizes[0]; + if (p.num_sizes > 1) { + ow = p.sizes[1]; } - default: { - return NOT_SUPPORTED; + } else { + oh = ih * p.scales[2]; + if (p.num_scales > 3) { + ow = iw * p.scales[3]; } } if (ic % 8 == 0) { @@ -66,28 +60,23 @@ EE resize_infer_output_size_cpu( } else { odf = idf; } - *outputDesc = tensor4df(idt, odf, in, ic, oh, ow); - *outputBytes = tensorNumBytes(*outputDesc); + if (tensorIs3d(inputDesc)) { + *outputDesc = tensor3df(idt, odf, in, ic, oh); + } else if (tensorIs4d(inputDesc)) { + *outputDesc = tensor4df(idt, odf, in, ic, oh, ow); + } return SUCCESS; } -EE resize_infer_output_size(Tensor *inputTensor, - DataType paramDT, - void *params, - Tensor *outputTensor, - U32 *outputBytes, - ArchInfo_t archInfo) +EE resize_infer_output_size( + Tensor *inputTensor, ResizeParamSpec p, Tensor *outputTensor, ArchInfo_t archInfo) { - if (inputTensor == nullptr) { - CHECK_STATUS(NULL_POINTER); - } - if (outputTensor == nullptr) { + if (inputTensor == nullptr || outputTensor == nullptr) { CHECK_STATUS(NULL_POINTER); } TensorDesc inputDesc = inputTensor->get_desc(); TensorDesc outputDesc = outputTensor->get_desc(); - EE ret = NOT_SUPPORTED; - ret = resize_infer_output_size_cpu(inputDesc, paramDT, params, &outputDesc, outputBytes); + EE ret = resize_infer_output_size_cpu(inputDesc, p, &outputDesc); if (IS_GPU(archInfo->arch)) { #ifdef _USE_GPU outputDesc.df = inputDesc.df; @@ -97,23 +86,49 @@ EE resize_infer_output_size(Tensor *inputTensor, return ret; } +EE resize_infer_forward_tmp_bytes( + Tensor inputTensor, ResizeParamSpec p, Tensor outputTensor, U32 *bytes, ArchInfo_t archInfo) +{ + if (bytes == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor.get_desc(); + TensorDesc outputDesc = outputTensor.get_desc(); + *bytes = 0; + auto arch = archInfo->arch; + if (IS_GPU(arch)) { + if (inputDesc.df == DF_NCHW && inputTensor.get_mem_type() != OCLMem) { + *bytes = tensorNumBytes(inputDesc); + } + } else { + if (DF_NCHW == inputDesc.df && (IS_ARM(arch) || IS_X86(arch))) { + int channelAxis = inputDesc.nDims - 2; + U32 paddedC = (inputDesc.dims[channelAxis] + 7) / 8 * 8; + inputDesc.dims[channelAxis] = paddedC; + outputDesc.dims[channelAxis] = paddedC; + *bytes = tensorNumBytes(inputDesc) + tensorNumBytes(outputDesc); + } + } + return SUCCESS; +} + EE resize_bilinear(TensorDesc inputDesc, void *input, + ResizeParamSpec p, + void *tmp, TensorDesc outputDesc, void *output, - 
void *tmp, - ResizeParamSpec p, ArchInfo_t archInfo) { auto arch = archInfo->arch; EE ret = NOT_SUPPORTED; if (IS_GENERAL(arch)) { #ifdef _USE_GENERAL - ret = resize_bilinear_general(inputDesc, input, outputDesc, output); + ret = resize_bilinear_general(inputDesc, input, p, outputDesc, output); #endif #ifdef _USE_X86 } else if (IS_X86(arch)) { - ret = resize_bilinear_x86(inputDesc, input, outputDesc, tmp, output, p); + ret = resize_bilinear_x86(inputDesc, input, p, tmp, outputDesc, output); #endif #ifdef _USE_NEON } else if (IS_ARM(arch)) { @@ -131,7 +146,7 @@ EE resize_bilinear(TensorDesc inputDesc, outputARM = inputARM + tensorNumBytes(inDescARM); transformNCHWToNCHWC8(inputDesc, input, inDescARM, inputARM); } - ret = resize_bilinear_arm(inDescARM, inputARM, outDescARM, outputARM); + ret = resize_bilinear_arm(inDescARM, inputARM, p, outDescARM, outputARM); if (DF_NCHWC8 != outputDesc.df) { transformToNCHW(outDescARM, outputARM, outputDesc, output); } @@ -142,16 +157,15 @@ EE resize_bilinear(TensorDesc inputDesc, (GCLMem_t)input, outputDesc, (GCLMem_t)tmp, (GCLMem_t)output); #endif } - CHECK_STATUS(ret); return ret; } EE resize_nearest(TensorDesc inputDesc, void *input, + ResizeParamSpec p, + void *tmp, TensorDesc outputDesc, void *output, - void *tmp, - ResizeParamSpec p, ArchInfo_t archInfo) { auto arch = archInfo->arch; @@ -170,7 +184,7 @@ EE resize_nearest(TensorDesc inputDesc, } EE resize( - Tensor inputTensor, Tensor tmpTensor, Tensor outputTensor, ResizeParamSpec p, ArchInfo_t archInfo) + Tensor inputTensor, ResizeParamSpec p, Tensor tmpTensor, Tensor outputTensor, ArchInfo_t archInfo) { auto arch = archInfo->arch; TensorDesc inputDesc = inputTensor.get_desc(); @@ -179,30 +193,36 @@ EE resize( void *output = get_ptr_from_tensor(outputTensor, arch); void *tmp = get_ptr_from_tensor(tmpTensor, arch); + if (inputDesc.nDims == 3) { + for (int i = inputDesc.nDims; i > 0; i--) { + inputDesc.dims[i] = inputDesc.dims[i - 1]; + outputDesc.dims[i] = outputDesc.dims[i - 1]; + } + inputDesc.nDims++; + outputDesc.nDims++; + } DataType idt, odt; DataFormat idf, odf; U32 in, ic, ih, iw; U32 on, oc, oh, ow; CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - CHECK_REQUIREMENT(in == on && ic == oc); if (ih == oh && iw == ow && IS_CPU(arch)) { - memcpy(output, input, tensorNumBytes(inputDesc)); + UNI_MEMCPY(output, input, tensorNumBytes(inputDesc)); return SUCCESS; } - EE ret; + EE ret = NOT_SUPPORTED; switch (p.mode) { - case NEAREST: - ret = resize_nearest(inputDesc, input, outputDesc, output, tmp, p, archInfo); + case RESIZE_NEAREST: + ret = resize_nearest(inputDesc, input, p, tmp, outputDesc, output, archInfo); break; - case LINEAR: - ret = resize_bilinear(inputDesc, input, outputDesc, output, tmp, p, archInfo); + case RESIZE_LINEAR: + ret = resize_bilinear(inputDesc, input, p, tmp, outputDesc, output, archInfo); break; default: - ret = NOT_SUPPORTED; break; } return ret; diff --git a/compute/image/tests/test_image_processing.cpp b/compute/image/tests/test_image_processing.cpp index 427ed21e..36232a8f 100644 --- a/compute/image/tests/test_image_processing.cpp +++ b/compute/image/tests/test_image_processing.cpp @@ -20,7 +20,7 @@ int main() TensorDesc rgbDesc = tensor4df(DT_U8, DF_RGB, 1, 3, 1280, 960); U8 *rgb = ut_input_v(tensorNumElements(rgbDesc), DT_U8, UT_INIT_POS); Tensor rgbTensor = Tensor::alloc_sized(rgbDesc); - memcpy(get_ptr_from_tensor(rgbTensor, ARM_A76), rgb, tensorNumBytes(rgbDesc)); + 
UNI_MEMCPY(get_ptr_from_tensor(rgbTensor, ARM_A76), rgb, tensorNumBytes(rgbDesc)); TensorDesc imageDesc = tensor4df(DT_F32, DF_NCHW, 1, 3, 224, 224); load_resize_image(rgbTensor, imageDesc, RGB, 0.017); diff --git a/compute/image/tests/test_image_resize.cpp b/compute/image/tests/test_image_resize.cpp index 369590fc..02e4e030 100644 --- a/compute/image/tests/test_image_resize.cpp +++ b/compute/image/tests/test_image_resize.cpp @@ -27,46 +27,46 @@ int resizeTest(int argc, char *argv[], DataType dt) U32 oc = atoi(argv[6]); U32 oh = atoi(argv[7]); U32 ow = atoi(argv[8]); - ArchInfo archInfo; - archInfo.arch = UT_ARCH; - ArchInfo archInfo_org; - archInfo_org.arch = CPU_GENERAL; - CHECK_REQUIREMENT(in == 1 && on == 1); - TensorDesc inputDesc, outputDesc; - inputDesc = tensor4df(dt, DF_NCHW, in, ic, ih, iw); + ArchInfo archInfo; + archInfo.arch = UT_ARCH; - DataType paramDT = DT_F32; - F32 scales[2]; - scales[0] = (F32)oh / (F32)ih; - scales[1] = (F32)ow / (F32)iw; + ResizeParamSpec p; + p.mode = RESIZE_LINEAR; + p.trans_mode = COORDINATE_TRANS_ASYMMETRIC; + p.num_sizes = 0; + p.num_scales = 4; + p.scales[0] = oh; + p.scales[1] = ow; + p.scales[2] = (F32)oh / (F32)ih; + p.scales[3] = (F32)ow / (F32)iw; // setup input + TensorDesc inputDesc = tensor4df(dt, DF_NCHW, in, ic, ih, iw); U8 *input = ut_input_v(in * ic * ih * iw, dt, UT_INIT_RANDOM); - Tensor inputTensor; - inputTensor.resize(inputDesc); - inputTensor.alloc(); - memcpy(get_ptr_from_tensor(inputTensor, UT_ARCH), input, tensorNumBytes(inputDesc)); + Tensor inputTensor = Tensor::alloc_sized(inputDesc); + UNI_MEMCPY(get_ptr_from_tensor(inputTensor, UT_ARCH), input, tensorNumBytes(inputDesc)); // setup output - U32 outputBytes; Tensor outputTensor; - CHECK_STATUS(resize_infer_output_size( - &inputTensor, paramDT, scales, &outputTensor, &outputBytes, &archInfo)); - outputDesc = outputTensor.get_desc(); + CHECK_STATUS(resize_infer_output_size(&inputTensor, p, &outputTensor, &archInfo)); + TensorDesc outputDesc = outputTensor.get_desc(); CHECK_REQUIREMENT(tensorNumElements(outputDesc) == on * oc * oh * ow); outputTensor.alloc(); Tensor outputTensorRef = Tensor::alloc_sized(outputDesc); - Tensor tmpTensor = Tensor::alloc_sized(tensor1d(DT_U8, 8 * tensorNumBytes(inputDesc))); - - ResizeParamSpec p; - p.mode = LINEAR; + U32 cpuTmpBytes = 0, cpuTmpBytesSerial = 0; + CHECK_STATUS( + resize_infer_forward_tmp_bytes(inputTensor, p, outputTensor, &cpuTmpBytes, &archInfo)); + CHECK_STATUS(resize_infer_forward_tmp_bytes( + inputTensor, p, outputTensorRef, &cpuTmpBytesSerial, &UT_SERIAL_ARCHINFO)); + Tensor tmpTensor = Tensor::alloc_sized(tensor1d(DT_I8, cpuTmpBytes)); + Tensor tmpTensorSerial = Tensor::alloc_sized(tensor1d(DT_I8, cpuTmpBytesSerial)); if (UT_CHECK) { - CHECK_STATUS(resize(inputTensor, tmpTensor, outputTensor, p, &archInfo)); + CHECK_STATUS(resize(inputTensor, p, tmpTensor, outputTensor, &archInfo)); // naive implement - CHECK_STATUS(resize(inputTensor, tmpTensor, outputTensorRef, p, &archInfo_org)); + CHECK_STATUS(resize(inputTensor, p, tmpTensorSerial, outputTensorRef, &UT_SERIAL_ARCHINFO)); // check ut_check_v(get_ptr_from_tensor(outputTensor, UT_ARCH), @@ -77,7 +77,7 @@ int resizeTest(int argc, char *argv[], DataType dt) // benchmark double time_start = ut_time_ms(); for (int iter = 0; iter < UT_LOOPS; iter++) { - CHECK_STATUS(resize(inputTensor, tmpTensor, outputTensor, p, &archInfo)); + CHECK_STATUS(resize(inputTensor, p, tmpTensor, outputTensor, &archInfo)); } double time_end = ut_time_ms(); double time = (time_end - time_start) / 
UT_LOOPS; diff --git a/compute/image/tests/test_image_resize_ocl.cpp b/compute/image/tests/test_image_resize_ocl.cpp index 564418ec..95fe27fe 100644 --- a/compute/image/tests/test_image_resize_ocl.cpp +++ b/compute/image/tests/test_image_resize_ocl.cpp @@ -31,72 +31,64 @@ int resizeTest(int argc, char *argv[], DataType dt) CHECK_REQUIREMENT(in == 1 && on == 1); - ArchInfo archInfo; - archInfo.arch = MALI; - ArchInfo archInfo_org; - archInfo_org.arch = CPU_GENERAL; - - TensorDesc inputDesc_cpu, inputDesc_gpu, outputDesc_cpu, outputDesc_gpu; - inputDesc_cpu = tensor4df(dt, DF_NCHW, in, ic, ih, iw); - inputDesc_gpu = tensor4df(dt, DF_NCHW, in, ic, ih, iw); - - DataType paramDT = DT_U32; - U32 scales[2]; - scales[0] = oh; - scales[1] = ow; + ResizeParamSpec p; + //p.mode = RESIZE_LINEAR; + p.mode = RESIZE_NEAREST; + p.trans_mode = COORDINATE_TRANS_ASYMMETRIC; + p.num_scales = 0; + p.num_sizes = 2; + p.sizes[0] = oh; + p.sizes[1] = ow; // setup input + TensorDesc inputDesc = tensor4df(dt, DF_NCHW, in, ic, ih, iw); U8 *input_cpu = ut_input_v(in * ic * ih * iw, dt, UT_INIT_RANDOM); - - Tensor inputTensorCpu; - inputTensorCpu.resize(inputDesc_cpu); - inputTensorCpu.alloc(); - memcpy(get_ptr_from_tensor(inputTensorCpu, UT_ARCH), input_cpu, tensorNumBytes(inputDesc_cpu)); + Tensor inputTensorCpu = Tensor::alloc_sized(inputDesc); + UNI_MEMCPY(get_ptr_from_tensor(inputTensorCpu, UT_ARCH), input_cpu, tensorNumBytes(inputDesc)); Tensor outputTensorCpu; - Tensor tmpTensorCpu; - U32 outputBytes; - CHECK_STATUS(resize_infer_output_size( - &inputTensorCpu, paramDT, scales, &outputTensorCpu, &outputBytes, &archInfo_org)); + CHECK_STATUS( + resize_infer_output_size(&inputTensorCpu, p, &outputTensorCpu, &UT_SERIAL_ARCHINFO)); outputTensorCpu.alloc(); + U32 cpuTmpBytes = 0; + CHECK_STATUS(resize_infer_forward_tmp_bytes( + inputTensorCpu, p, outputTensorCpu, &cpuTmpBytes, &UT_SERIAL_ARCHINFO)); + Tensor tmpTensorCpu = Tensor::alloc_sized(tensor1d(DT_I8, cpuTmpBytes)); - ResizeParamSpec p; - //p.mode = LINEAR; - p.mode = NEAREST; - p.trans_mode = ASYMMETRIC; // CPU output - CHECK_STATUS(resize(inputTensorCpu, tmpTensorCpu, outputTensorCpu, p, &archInfo_org)); - std::shared_ptr handleSharedPtr = OCLContext::getInstance().handle; + CHECK_STATUS(resize(inputTensorCpu, p, tmpTensorCpu, outputTensorCpu, &UT_SERIAL_ARCHINFO)); + ArchInfo archInfo; + archInfo.arch = MALI; + std::shared_ptr handleSharedPtr = OCLContext::getInstance().handle; GCLHandle_t handle = handleSharedPtr.get(); std::vector kernelVec; handle->kernelVec = &kernelVec; - Tensor inputTensor = Tensor(OCLMem); - Tensor outputTensor = Tensor(OCLMem); - Tensor tmpTensor = Tensor(OCLMem); - inputTensor.resize(inputDesc_gpu); - MaliPara maliPara; maliPara.handle = handle; archInfo.archPara = &maliPara; - CHECK_STATUS(resize_infer_output_size( - &inputTensor, paramDT, scales, &outputTensor, &outputBytes, &archInfo)); + Tensor inputTensor = Tensor(OCLMem); + Tensor outputTensor = Tensor(OCLMem); + Tensor tmpTensor = Tensor(OCLMem); + inputTensor.resize(inputDesc); + + CHECK_STATUS(resize_infer_output_size(&inputTensor, p, &outputTensor, &archInfo)); U32 maxBytes = 0; U32 tmpBytes = 0; GCLMem_t output = alloc(outputTensor); GCLMem_t input = alloc(inputTensor); CHECK_STATUS(gcl_fill_memory_zero(handle, input)); - outputDesc_gpu = outputTensor.get_desc(); + TensorDesc outputDesc_gpu = outputTensor.get_desc(); U8 *output_gpu = ut_input_v(on * oc * oh * ow, dt, UT_INIT_RANDOM); - tmpBytes = tensorNumBytes(inputDesc_gpu); + tmpBytes = tensorNumBytes(inputDesc); 
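The resize entry points above now take the ResizeParamSpec and the temporary buffer ahead of the output, and the target shape travels inside the param spec instead of a separate DataType/scales pair. Condensed from the updated tests, the new calling sequence looks roughly like this (oh/ow stand for the desired output height and width):

    ResizeParamSpec p;
    p.mode = RESIZE_LINEAR;
    p.trans_mode = COORDINATE_TRANS_ASYMMETRIC;
    p.num_scales = 0;
    p.num_sizes = 2;
    p.sizes[0] = oh;  // target height
    p.sizes[1] = ow;  // target width
    Tensor outputTensor;
    CHECK_STATUS(resize_infer_output_size(&inputTensor, p, &outputTensor, &archInfo));
    outputTensor.alloc();
    U32 tmpBytes = 0;
    CHECK_STATUS(resize_infer_forward_tmp_bytes(inputTensor, p, outputTensor, &tmpBytes, &archInfo));
    Tensor tmpTensor = Tensor::alloc_sized(tensor1d(DT_I8, tmpBytes));
    CHECK_STATUS(resize(inputTensor, p, tmpTensor, outputTensor, &archInfo));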
maxBytes = (tmpBytes > maxBytes) ? tmpBytes : maxBytes; tmpBytes = tensorNumBytes(outputDesc_gpu); maxBytes = (tmpBytes > maxBytes) ? tmpBytes : maxBytes; GCLMem_t tmpbuf = alloc_bytes(tmpTensor, maxBytes); - CHECK_STATUS(ocl_set_input(handle, input, inputDesc_gpu, input_cpu, tmpbuf, true)); + CHECK_STATUS(ocl_set_input(handle, input, inputDesc, input_cpu, tmpbuf, true)); - CHECK_STATUS(resize(inputTensor, tmpTensor, outputTensor, p, &archInfo)); + CHECK_STATUS(resize(inputTensor, p, tmpTensor, outputTensor, &archInfo)); /*warp up*/ UNI_INFO_LOG("warm up gpu:\n") for (U32 i = 0; i < 2; i++) { diff --git a/compute/tensor/include/feature.h b/compute/tensor/include/feature.h new file mode 100644 index 00000000..15cba329 --- /dev/null +++ b/compute/tensor/include/feature.h @@ -0,0 +1,254 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
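The new compute/tensor/include/feature.h that starts here adds TensorFlow-style feature-column helpers (vocabulary lookup, hash bucketing, bucketization, feature crossing, and embedding combination) built on farmhash fingerprints. As a rough usage sketch for the hash-bucket column, with the element types assumed from how tf_hash is applied to the inputs (the template arguments are not legible in this extract):

    std::vector<std::string> cities = {"shanghai", "shenzhen", "shanghai"};
    // identical strings always land in the same bucket id within [0, hash_bucket_size)
    std::vector<TYPE> ids = categorical_column_with_hash_bucket(cities, 1000);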
+ +#ifndef _H_FEATURE +#define _H_FEATURE + +#include +#include +#include + +#include "farmhash.h" + +#include "tensor_desc.h" + +typedef enum Combiner { + Combiner_Mean, + Combiner_Sum, +} Combiner; +typedef int TYPE; + +inline uint64_t shift_mix(const uint64_t val) +{ + return val ^ (val >> 47); +} + +inline uint64_t FingerprintCat64(const uint64_t &fp1, const uint64_t &fp2) +{ + static const uint64_t kMul = 0xc6a4a7935bd1e995ULL; + uint64_t result = fp1 ^ kMul; + result ^= shift_mix(fp2 * kMul) * kMul; + result *= kMul; + result = shift_mix(result) * kMul; + result = shift_mix(result); + return result; +} + +static inline uint64_t tf_hash(const std::string &s) +{ + return ::util::Fingerprint64(s.data(), s.size()); +} + +static inline uint64_t tf_hash64(const uint64_t &s, const uint64_t &key) +{ + return FingerprintCat64(key, s); +} + +static inline uint64_t tf_hash64(const std::string &s, const uint64_t &key) +{ + return FingerprintCat64(key, tf_hash(s)); +} + +std::vector categorical_column_with_vocabulary_list(const std::vector &input, + std::map &vocab, + int default_value = -1, + int num_oov_buckets = 0) +{ + int vocab_size = vocab.size(); + std::vector ret(input.size()); + for (uint32_t i = 0; i < input.size(); i++) { + if (vocab.find(input[i]) != vocab.end()) { + ret[i] = vocab[input[i]]; + } else { + if (num_oov_buckets > 0) { + ret[i] = tf_hash(input[i]) % num_oov_buckets + vocab_size; + } else { + ret[i] = default_value; + } + } + } + return ret; +} + +std::vector categorical_column_with_hash_bucket( + const std::vector &input, int hash_bucket_size) +{ + std::vector ret(input.size()); + for (uint32_t i = 0; i < input.size(); i++) { + ret[i] = tf_hash(input[i]) % hash_bucket_size; + } + return ret; +} + +std::vector categorical_column_with_identity( + const std::vector &input, int bucket_size, int default_value = 0) +{ + std::vector ret(input.size()); + for (uint32_t i = 0; i < input.size(); i++) { + if (input[i] < bucket_size) { + ret[i] = input[i]; + } else { + ret[i] = default_value; + } + } + return ret; +} + +inline uint32_t quick_search( + const std::vector &data, const TYPE &query, const uint32_t &left, const uint32_t &right) +{ +#if 1 + for (int j = left; j < right; j++) { + if (query < data[j]) { + return j; + } + } +#else + if (left >= right) { + return left; + } + int mid = (left + right) / 2; + if (query < data[mid]) { + return quick_search(data, query, left, mid); + } else { + return quick_search(data, query, mid, right); + } +#endif +} + +std::vector bucketized_column( + const std::vector &input, const std::vector &boundaries) +{ + std::vector ret(input.size()); + uint32_t size = boundaries.size(); + for (uint32_t i = 0; i < input.size(); i++) { + ret[i] = quick_search(boundaries, input[i], 0, size); + ; + } + return ret; +} + +void indicator_column(const TensorDesc &input_desc, + const TYPE *input, + int categorical_num, + TensorDesc *output_desc, + TYPE *output, + const TYPE *weight = nullptr) +{ + *output_desc = input_desc; + output_desc->dims[0] = categorical_num; + uint32_t count = 1; + for (uint32_t i = 1; i < input_desc.nDims; i++) { + count *= input_desc.dims[i]; + } + + memset(output, 0, count * categorical_num * sizeof(TYPE)); + if (weight != nullptr) { + for (uint32_t i = 0, j = 0, n = 0; i < count; i++, j += categorical_num) { + for (uint32_t k = 0; k < input_desc.dims[0]; k++, n++) { + output[j + input[n]] += weight[n]; + } + } + } else { + for (uint32_t i = 0, j = 0, n = 0; i < count; i++, j += categorical_num) { + for (uint32_t k = 0; k < 
input_desc.dims[0]; k++, n++) { + output[j + input[n]]++; + } + } + } +} + +template +std::vector crossed_column(const std::vector &input0, + const std::vector &input1, + int hash_bucket_size, + const uint64_t hash_key = 0xDECAFCAFFE) +{ + std::vector ret(input0.size()); + for (uint32_t i = 0; i < input0.size(); i++) { + ret[i] = tf_hash64(input1[i], tf_hash64(input0[i], hash_key)) % hash_bucket_size; + } + return ret; +} + +template +std::vector numeric_column(const std::vector &input, + F const &normalizer_fn = nullptr, + int shape = 0, + TO default_value = -1) +{ + if (shape > 0) { + return std::vector(shape, default_value); + } + std::vector ret = std::vector(input.size()); + if (normalizer_fn == nullptr) { + for (uint32_t i = 0; i < input.size(); i++) { + ret[i] = input[i]; + } + } else { + for (uint32_t i = 0; i < input.size(); i++) { + ret[i] = normalizer_fn(input[i]); + } + } + return ret; +} + +template +inline void embedding_combine(const std::vector &input, const uint32_t &dimension, T *output) +{ + if (input.size() == 0) { + memset(output, 0, sizeof(T) * dimension); + return; + } + if (combiner == Combiner_Mean || combiner == Combiner_Sum) { + memcpy(output, input[0], sizeof(T) * dimension); + for (uint32_t i = 1; i < input.size(); i++) { + for (uint32_t j = 0; j < dimension; j++) { + output[j] += input[i][j]; + } + } + if (combiner == Combiner_Mean) { + for (uint32_t j = 0; j < dimension; j++) { + output[j] /= input.size(); + } + } + } else { + printf("[ERROR] currently not support combine function %d.\n", combiner); + exit(1); + } +} + +template +void embedding_column(const TensorDesc &input_desc, + const TYPE *input, + const T *vocab, + const uint32_t &dimension, + TensorDesc *output_desc, + T *output) +{ + *output_desc = input_desc; + output_desc->dims[0] = dimension; + uint32_t count = 1; + for (uint32_t i = 1; i < input_desc.nDims; i++) { + count *= input_desc.dims[i]; + } + + std::vector vec(input_desc.dims[0]); + for (uint32_t i = 0, j = 0; i < count; i++, output += dimension) { + for (uint32_t k = 0; k < input_desc.dims[0]; k++, j++) { + vec[k] = vocab + input[j] * dimension; + } + embedding_combine(vec, dimension, output); + } +} +#endif diff --git a/compute/tensor/include/tensor_computing.h b/compute/tensor/include/tensor_computing.h index 6b7992ca..2a562c14 100644 --- a/compute/tensor/include/tensor_computing.h +++ b/compute/tensor/include/tensor_computing.h @@ -172,6 +172,7 @@ EE depthwise_pointwise_convolution(std::vector inputTensors, Tensor pwFilterTensor, ConvolutionParamSpec convParamSpec, DepthwiseConvolutionForwardAlgorithm algorithm, + void *scale, Tensor dwBiasTensor, Tensor pwBiasTensor, std::vector tmpTensors, @@ -221,6 +222,7 @@ EE depthwise_convolution(Tensor inputTensor, Tensor filterTensor, ConvolutionParamSpec convParamSpec, DepthwiseConvolutionForwardAlgorithm algorithm, + void *scale, Tensor biasTensor, Tensor tmpTensor, Tensor outputTensor, @@ -272,7 +274,8 @@ EE activation( EE concat_infer_output_size( std::vector inputTensor, ConcatParamSpec p, Tensor *outputTensor, ArchInfo_t archInfo); -EE concat_infer_forward_tmp_bytes(std::vector inputTensor, U32 *bytes, ArchInfo_t archInfo); +EE concat_infer_forward_tmp_bytes( + std::vector inputTensor, Tensor outputTensor, U32 *bytes, ArchInfo_t archInfo); EE concat(std::vector inputTensor, ConcatParamSpec p, @@ -320,14 +323,20 @@ EE fully_connected(Tensor inputTensor, EE softmax_infer_output_size( Tensor *inputTensor, SoftmaxParamSpec p, Tensor *outputTensor, ArchInfo_t archInfo); +EE 
softmax_infer_forward_tmp_bytes( + Tensor inputTensor, SoftmaxParamSpec p, U32 *bytes, ArchInfo_t archInfo); + EE softmax(Tensor inputTensor, SoftmaxParamSpec p, Tensor tmpTensor, Tensor outputTensor, ArchInfo_t archInfo); -EE softmax_infer_forward_tmp_bytes( - Tensor inputTensor, SoftmaxParamSpec p, U32 *bytes, ArchInfo_t archInfo); +EE logsoftmax(Tensor inputTensor, + SoftmaxParamSpec p, + Tensor tmpTensor, + Tensor outputTensor, + ArchInfo_t archInfo); EE rnn_infer_output_size(std::vector inputTensor, RNNParamSpec rnnParamSpec, @@ -465,6 +474,7 @@ EE normalization_infer_output_size(Tensor *inputTensor, Tensor *outputTensor, Ar EE normalization_infer_forward_tmp_bytes(Tensor inputTensor, U32 *bytes, ArchInfo_t archInfo); EE layer_normalization(Tensor inputTensor, + LayerNormParamSpec p, Tensor alphaTensor, Tensor betaTensor, Tensor tmpTensor, @@ -554,7 +564,8 @@ EE attention_infer_output_size(Tensor *inputTensor, AttentionParamSpec p, Tensor EE attention(Tensor inputTensor, Tensor outputTensor, ArchInfo_t archInfo); -EE power_infer_output_size(Tensor *inputTensor, Tensor *outputTensor, ArchInfo_t archInfo); +EE power_infer_output_size( + Tensor *inputTensor, PowerParamSpec p, Tensor *outputTensor, ArchInfo_t archInfo); EE power(Tensor inputTensor, PowerParamSpec p, Tensor outputTensor, ArchInfo_t archInfo); @@ -689,9 +700,12 @@ EE yolov3detectionoutput(std::vector inputTensor, Tensor outputTensor, ArchInfo_t archInfo); -EE preallocated_memory_infer_output_size(Tensor *outputTensor, ArchInfo_t archInfo); +EE preallocated_memory_infer_output_size(std::vector inputTensors, + PreAllocatedMemoryParamSpec p, + Tensor *outputTensor, + ArchInfo_t archInfo); -EE preallocated_memory(Tensor outputTensor, ArchInfo_t archInfo); +EE preallocated_memory(PreAllocatedMemoryParamSpec p, Tensor outputTensor, ArchInfo_t archInfo); EE copy_infer_output_size(std::vector inputTensor, ArchInfo_t archInfo); @@ -795,26 +809,16 @@ EE tile(Tensor inputTensor, Tensor outputTensor, ArchInfo_t archInfo); -EE where_infer_output_size(Tensor *inputTensor, Tensor *outputTensor, ArchInfo_t archInfo); +EE where_infer_output_size( + Tensor *xTensor, Tensor *yTensor, Tensor *outputTensor, ArchInfo_t archInfo); -EE where(Tensor inputTensor, - Tensor conditionTensor, - Tensor yTensor, - Tensor outputTensor, - ArchInfo_t archInfo); +EE where( + Tensor conditionTensor, Tensor xTensor, Tensor yTensor, Tensor outputTensor, ArchInfo_t archInfo); EE cast_infer_output_size( - Tensor *inputTensor, Tensor *outputTensor, CastParamSpec p, ArchInfo_t archInfo); - -EE cast(Tensor inputTensor, Tensor outputTensor, CastParamSpec p, ArchInfo_t archInfo); - -EE equal_infer_output_size(Tensor *inputTensor, Tensor *outputTensor, ArchInfo_t archInfo); + Tensor *inputTensor, CastParamSpec p, Tensor *outputTensor, ArchInfo_t archInfo); -EE equal(Tensor inputTensor, - Tensor compareTensor, - EqualParamSpec p, - Tensor outputTensor, - ArchInfo_t archInfo); +EE cast(Tensor inputTensor, CastParamSpec p, Tensor outputTensor, ArchInfo_t archInfo); EE quantize(Tensor inputTensor, Tensor *outputTensor, F32 *scale, ArchInfo_t archInfo); @@ -930,4 +934,15 @@ EE generate_proposals(Tensor deltaTensor, std::vector tmpTensors, Tensor outputTensor, ArchInfo_t archInfo); + +EE onehot_infer_output_size( + Tensor *inputTensor, OneHotParamSpec p, DataType type, Tensor *outputTensor, ArchInfo_t archInfo); + +EE onehot(Tensor inputTensor, OneHotParamSpec p, Tensor outputTensor, ArchInfo_t archInfo); + +EE cumsum_infer_output_size(Tensor *inputTensor, Tensor 
*outputTensor, ArchInfo_t archInfo); + +EE cumsum(Tensor inputTensor, CumSumParamSpec p, Tensor outputTensor, ArchInfo_t archInfo); + +EE non_zero(Tensor inputTensor, Tensor outputTensor, ArchInfo_t archInfo); #endif diff --git a/compute/tensor/src/CMakeLists.txt b/compute/tensor/src/CMakeLists.txt index bc1b111d..14e606b2 100644 --- a/compute/tensor/src/CMakeLists.txt +++ b/compute/tensor/src/CMakeLists.txt @@ -4,6 +4,7 @@ if (USE_GENERAL) endif (USE_GENERAL) if (USE_NEON) + file(GLOB arm_int32_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/arm/int32/*.cpp) if (USE_FP32) file(GLOB arm_fp32_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/arm/fp32/*.cpp) if ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "aarch64") @@ -20,18 +21,19 @@ if (USE_NEON) if (USE_INT8) file(GLOB arm_int8_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/arm/int8/*.cpp) if (USE_FP16) - file(GLOB armv8_int8_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/arm/int8/v8/*.cpp) - else () + file(GLOB armv8_int8_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/arm/int8/v8.2/*.cpp) + elseif (NOT "${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "aarch64") file(GLOB armv7_int8_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/arm/int8/v7/*.cpp) endif () set(arm_int8_srcs "${arm_int8_srcs};${armv8_int8_srcs};${armv7_int8_srcs}") endif (USE_INT8) file(GLOB arm_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/arm/*.cpp) - set(arm_srcs "${arm_srcs};${arm_fp16_srcs};${arm_fp32_srcs};${arm_int8_srcs};${arm_bnn_srcs}") + set(arm_srcs "${arm_srcs};${arm_fp16_srcs};${arm_fp32_srcs};${arm_int8_srcs};${arm_bnn_srcs};${arm_int32_srcs}") file(GLOB cpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/*.cpp) endif (USE_NEON) if (USE_X86) + file(GLOB x86_int32_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/x86/int32/*.cpp) if (USE_FP32) file(GLOB x86_fp32_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/x86/fp32/*.cpp) endif (USE_FP32) @@ -39,7 +41,7 @@ if (USE_X86) file(GLOB x86_int8_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/x86/int8/*.cpp) endif (USE_INT8) file(GLOB x86_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/x86/*.cpp) - set(x86_srcs "${x86_srcs};${x86_fp32_srcs};${x86_int8_srcs}") + set(x86_srcs "${x86_srcs};${x86_int32_srcs};${x86_fp32_srcs};${x86_int8_srcs}") file(GLOB cpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/*.cpp) endif (USE_X86) @@ -58,6 +60,9 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}) # shared library add_library(${PROJECT_NAME} SHARED ${srcs}) target_link_libraries(${PROJECT_NAME} LINK_PUBLIC blas_enhance uni) +if (USE_SECURE_C) + target_link_libraries(${PROJECT_NAME} LINK_PUBLIC ${SecureC_SHARED_LIBRARY}) +endif () # static library add_library(${PROJECT_NAME}_static STATIC ${srcs}) diff --git a/compute/tensor/src/attention.cpp b/compute/tensor/src/attention.cpp index 9f05d2c0..8541236e 100644 --- a/compute/tensor/src/attention.cpp +++ b/compute/tensor/src/attention.cpp @@ -61,8 +61,10 @@ EE attention_infer_output_size(Tensor *inputTensor, AttentionParamSpec p, Tensor DataFormat df; U32 batch, sequenceLength; CHECK_STATUS(tensor2dGet(inputDesc, &dt, &df, &batch, &sequenceLength)); + U32 oh = UNI_MIN(p.from_sequence_length, sequenceLength); + U32 ow = UNI_MIN(p.to_sequence_length, sequenceLength); outputDesc = - tensor4df(dt, DF_NCHW, batch, p.num_heads, p.from_sequence_length, p.to_sequence_length); + tensor4df(dt, DF_NCHW, batch, p.num_heads, oh, ow); outputTensor->resize(outputDesc); return SUCCESS; } diff --git a/compute/tensor/src/cast.cpp b/compute/tensor/src/cast.cpp index 243a08b5..1e7fe5e3 100644 --- a/compute/tensor/src/cast.cpp +++ b/compute/tensor/src/cast.cpp @@ -10,24 +10,25 @@ // WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + #include "tensor_computing.h" +#ifdef _USE_CPU +#include "cpu/tensor_computing_cpu.h" +#endif #ifdef _USE_GPU #include "gpu/mali/tensor_computing_mali.h" #endif EE cast_infer_output_size( - Tensor *inputTensor, Tensor *outputTensor, CastParamSpec p, ArchInfo_t archInfo) + Tensor *inputTensor, CastParamSpec p, Tensor *outputTensor, ArchInfo_t archInfo) { - if (inputTensor == nullptr) { - CHECK_STATUS(NULL_POINTER); - } - if (outputTensor == nullptr) { - CHECK_STATUS(NULL_POINTER); + if (inputTensor == nullptr || outputTensor == nullptr) { + return NULL_POINTER; } TensorDesc inputDesc = inputTensor->get_desc(); TensorDesc outputDesc = outputTensor->get_desc(); outputDesc = inputDesc; - outputDesc.dt = p.targetDt; + outputDesc.dt = p.dt; if (IS_GPU(archInfo->arch)) { #ifdef _USE_GPU if (outputDesc.dt != DT_I32 && outputDesc.dt != DT_F16) { @@ -35,107 +36,29 @@ EE cast_infer_output_size( } #endif } - outputTensor->resize(outputDesc); - return SUCCESS; -} - -template -static EE diffSourceCastKernel(U32 len, TI *inputPtr, TO *outputPtr) -{ - for (U32 i = 0; i < len; ++i) { - outputPtr[i] = (TO)(inputPtr[i]); +#ifdef _USE_CPU + if (tensorIsShape(inputDesc)) { + outputDesc.dt = DT_U32; } - return SUCCESS; -} - -template -static EE diffSourceCast(TensorDesc inputDesc, T *inputPtr, void *outputPtr, CastParamSpec p) -{ - EE ret = SUCCESS; - U32 len = tensorNumElements(inputDesc); - switch (p.targetDt) { - case DT_I32: { - diffSourceCastKernel(len, inputPtr, (I32 *)outputPtr); - break; - } - case DT_U32: { - diffSourceCastKernel(len, inputPtr, (U32 *)outputPtr); - break; - } -#ifdef _USE_FP32 - case DT_F32: { - diffSourceCastKernel(len, inputPtr, (F32 *)outputPtr); - break; - } -#endif -#ifdef _USE_FP16 - case DT_F16: { - diffSourceCastKernel(len, inputPtr, (F16 *)outputPtr); - break; - } #endif - case DT_U8: { - diffSourceCastKernel(len, inputPtr, (U8 *)outputPtr); - break; - } - case DT_I8: { - diffSourceCastKernel(len, inputPtr, (INT8 *)outputPtr); - break; - } - default: - ret = NOT_SUPPORTED; - break; - } - return ret; + outputTensor->resize(outputDesc); + return SUCCESS; } -EE cast(Tensor inputTensor, Tensor outputTensor, CastParamSpec p, ArchInfo_t archInfo) +EE cast(Tensor inputTensor, CastParamSpec p, Tensor outputTensor, ArchInfo_t archInfo) { auto arch = archInfo->arch; TensorDesc inputDesc = inputTensor.get_desc(); void *input = get_ptr_from_tensor(inputTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); void *output = get_ptr_from_tensor(outputTensor, arch); - EE ret = NOT_SUPPORTED; if (IS_CPU(arch)) { #ifdef _USE_CPU - switch (inputDesc.dt) { -#ifdef _USE_FP32 - case DT_F32: { - ret = diffSourceCast(inputDesc, (F32 *)input, output, p); - break; - } -#endif -#ifdef _USE_FP16 - case DT_F16: { - ret = diffSourceCast(inputDesc, (F16 *)input, output, p); - break; - } -#endif - case DT_U32: { - ret = diffSourceCast(inputDesc, (U32 *)input, output, p); - break; - } - case DT_I32: { - ret = diffSourceCast(inputDesc, (I32 *)input, output, p); - break; - } - case DT_U8: { - ret = diffSourceCast(inputDesc, (U8 *)input, output, p); - break; - } - case DT_I8: { - ret = diffSourceCast(inputDesc, (INT8 *)input, output, p); - break; - } - default: - ret = NOT_SUPPORTED; - break; - } + ret = cast_cpu(inputDesc, 
input, outputDesc, output); #endif #ifdef _USE_GPU } else if (IS_GPU(arch)) { - TensorDesc outputDesc = outputTensor.get_desc(); ret = cast_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, (GCLMem_t)input, p, outputDesc, (GCLMem_t)output); #endif diff --git a/compute/tensor/src/check.cpp b/compute/tensor/src/check.cpp index c61c7feb..0a3707f9 100644 --- a/compute/tensor/src/check.cpp +++ b/compute/tensor/src/check.cpp @@ -12,14 +12,8 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #include "tensor_computing.h" -#ifdef _USE_GENERAL -#include "cpu/general/tensor_computing_general.h" -#endif -#ifdef _USE_X86 -#include "cpu/x86/tensor_computing_x86.h" -#endif -#ifdef _USE_NEON -#include "cpu/arm/tensor_computing_arm.h" +#ifdef _USE_CPU +#include "cpu/tensor_computing_cpu.h" #endif #ifdef _USE_GPU #include "gpu/mali/tensor_computing_mali.h" @@ -39,17 +33,9 @@ EE check(Tensor inputTensorA, TensorDesc outputDesc = outputTensor.get_desc(); void *output = get_ptr_from_tensor(outputTensor, arch); EE ret = NOT_SUPPORTED; - if (IS_GENERAL(arch)) { + if (IS_CPU(arch)) { #ifdef _USE_GENERAL - ret = check_general(inputDescA, inputA, inputDescB, inputB, p, outputDesc, output); -#endif -#ifdef _USE_X86 - } else if (IS_X86(arch)) { - ret = check_x86(inputDescA, inputA, inputDescB, inputB, p, outputDesc, output); -#endif -#ifdef _USE_NEON - } else if (IS_ARM(arch)) { - ret = check_arm(inputDescA, inputA, inputDescB, inputB, p, outputDesc, output); + ret = check_cpu(inputDescA, inputA, inputDescB, inputB, p, outputDesc, output); #endif #ifdef _USE_GPU } else if (IS_GPU(arch)) { @@ -63,27 +49,16 @@ EE check(Tensor inputTensorA, EE check_infer_output_size( std::vector inputTensor, Tensor *outputTensor, ArchInfo_t archInfo) { - EE ret = NOT_SUPPORTED; if (outputTensor == nullptr) { CHECK_STATUS(NULL_POINTER); } - for (auto p : inputTensor) { - if (p == nullptr) { - CHECK_STATUS(NULL_POINTER); - } + TensorDesc outputDesc = inputTensor[0]->get_desc(); + if (inputTensor.size() > 1 && inputTensor[0]->length() < inputTensor[1]->length()) { + outputDesc = inputTensor[1]->get_desc(); } - TensorDesc inputDesc = inputTensor[0]->get_desc(); - TensorDesc outputDesc = outputTensor->get_desc(); - outputDesc.dt = DT_I32; - outputDesc.nDims = 1; - outputDesc.df = DF_NORMAL; - outputDesc.dims[0] = inputDesc.dims[inputDesc.nDims - 1]; + outputDesc.dt = DT_U8; if (IS_GPU(archInfo->arch)) { -#ifdef _USE_GPU - if (outputDesc.dims[0] > 1) { - CHECK_STATUS(NOT_SUPPORTED); - } -#endif + outputDesc.dt = DT_I32; } outputTensor->resize(outputDesc); return SUCCESS; diff --git a/compute/tensor/src/concat.cpp b/compute/tensor/src/concat.cpp index f80cf54a..f7c0e32c 100644 --- a/compute/tensor/src/concat.cpp +++ b/compute/tensor/src/concat.cpp @@ -22,15 +22,16 @@ inline void processInputDescs(std::vector *inputDesc, I32 axis) { - int inputNum = inputDesc->size(); - int axisInfo = (axis > 0) ? 
axis : ((*inputDesc)[0].nDims + axis); - axisInfo = (*inputDesc)[0].nDims - 1 - axisInfo; - for (int i = 0; i < (int)(*inputDesc)[0].nDims; i++) { - if (i == axisInfo) { + int num = inputDesc->size(); + int dim = (*inputDesc)[0].nDims; + axis = (axis + dim) % dim; + axis = dim - 1 - axis; + for (int i = 0; i < dim; i++) { + if (i == axis) { continue; } U32 minDim = (*inputDesc)[0].dims[i]; - for (int j = 1; j < inputNum; j++) { + for (int j = 1; j < num; j++) { if ((*inputDesc)[j].dims[i] < minDim) { minDim = (*inputDesc)[j].dims[i]; } @@ -38,7 +39,7 @@ inline void processInputDescs(std::vector *inputDesc, I32 axis) if (minDim == 0) { continue; } - for (int j = 0; j < inputNum; j++) { + for (int j = 0; j < num; j++) { (*inputDesc)[j].dims[i] = minDim; } } @@ -48,7 +49,7 @@ inline EE concat_infer_output_size_cpu( std::vector inputDesc, ConcatParamSpec p, TensorDesc *outputDesc) { if (inputDesc.size() < 1) { - CHECK_STATUS(NOT_MATCH); + return NOT_MATCH; } if (inputDesc.size() == 1) { *outputDesc = inputDesc[0]; @@ -70,11 +71,13 @@ inline EE concat_infer_output_size_cpu( axis = dim - 1 - axis; outputDesc->dims[axis] = 0; + int shapeCount = 0; for (U32 i = 0; i < inputDesc.size(); i++) { if (inputDesc[i].nDims == 0) { continue; } + shapeCount += tensorIsShape(inputDesc[i]); if (inputDesc[i].nDims != (U32)dim) { return NOT_MATCH; } @@ -101,7 +104,18 @@ inline EE concat_infer_output_size_cpu( outputDesc->df = DF_NCHW; } - return SUCCESS; + EE ret = SUCCESS; +#ifdef _USE_CPU + if (shapeCount > 0) { + std::vector input(inputDesc.size()); + for (U32 i = 0; i < inputDesc.size(); i++) { + input[i] = inputDesc[i].dims + inputDesc[i].nDims; + } + ret = concat_cpu(inputDesc, input, nullptr, p, nullptr, *outputDesc, + outputDesc->dims + outputDesc->nDims, nullptr); + } +#endif + return ret; } EE concat_infer_output_size( @@ -130,9 +144,11 @@ EE concat_infer_output_size( return ret; } -EE concat_infer_forward_tmp_bytes(std::vector inputTensor, U32 *bytes, ArchInfo_t archInfo) +EE concat_infer_forward_tmp_bytes( + std::vector inputTensor, Tensor outputTensor, U32 *bytes, ArchInfo_t archInfo) { std::vector inputDesc = get_desc_from_tensors(inputTensor); + TensorDesc outputDesc = outputTensor.get_desc(); EE ret = NOT_SUPPORTED; if (IS_GPU(archInfo->arch)) { #ifdef _USE_GPU @@ -142,7 +158,9 @@ EE concat_infer_forward_tmp_bytes(std::vector inputTensor, U32 *bytes, A } else { *bytes = 0; for (auto p : inputDesc) { - *bytes += tensorNumBytes(p); + if (p.df != outputDesc.df) { + *bytes += tensorNumBytes(p); + } } ret = SUCCESS; } diff --git a/compute/tensor/src/convolution.cpp b/compute/tensor/src/convolution.cpp index d9e7e9ff..c9ffbd9e 100644 --- a/compute/tensor/src/convolution.cpp +++ b/compute/tensor/src/convolution.cpp @@ -57,9 +57,9 @@ inline EE convolution_infer_output_size_cpu(TensorDesc inputDesc, U32 ftDilated = (ft - 1) * p.dilatedRate_t + 1; U32 fhDilated = (fh - 1) * p.dilatedRate_h + 1; U32 fwDilated = (fw - 1) * p.dilatedRate_w + 1; - ot = (it + p.padding_before + p.padding_after - ftDilated) / p.stride_t + 1; - oh = (ih + p.padding_top + p.padding_bottom - fhDilated) / p.stride_h + 1; - ow = (iw + p.padding_left + p.padding_right - fwDilated) / p.stride_w + 1; + ot = (it + p.pad_before + p.pad_after - ftDilated) / p.stride_t + 1; + oh = (ih + p.pad_top + p.pad_bottom - fhDilated) / p.stride_h + 1; + ow = (iw + p.pad_left + p.pad_right - fwDilated) / p.stride_w + 1; if (ot < 0 || oh < 0 || ow < 0) { ret = NOT_MATCH; } @@ -377,9 +377,8 @@ EE convolution(std::vector inputTensors, } ret = 
convolution_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, (GCLMem_t)input, filterDesc, (GCLMem_t)filter, convParamSpec, - ((MaliPara_t)(archInfo->archPara))->forwardRunInfo, scaleDesc, (GCLMem_t)scale, - biasDesc, (GCLMem_t)bias, tmpBytes, tmpVec, outputDesc, (GCLMem_t)output, - activationDesc.mode); + ((MaliPara_t)(archInfo->archPara))->forwardRunInfo, scaleDesc, (GCLMem_t)scale, biasDesc, + (GCLMem_t)bias, tmpBytes, tmpVec, outputDesc, (GCLMem_t)output, activationDesc.mode); #endif } @@ -388,7 +387,7 @@ EE convolution(std::vector inputTensors, if (inputTensors.size() > 1 && isEltwiseSeperate) { std::vector eltwiseInputTensors = {outputTensor, inputTensors[1]}; EltwiseParamSpec eltwiseDesc; - eltwiseDesc.elt_mode = ELTWISE_SUM; + eltwiseDesc.mode = ELTWISE_SUM; eltwiseDesc.activation_type = eltwiseActDesc.mode; eltwiseDesc.activation_spec = convParamSpec.activation_spec; ret = eltwise(eltwiseInputTensors, eltwiseDesc, tmpTensors[0], outputTensor, archInfo); diff --git a/compute/tensor/src/copy.cpp b/compute/tensor/src/copy.cpp index 17b9da4a..3fcf8102 100644 --- a/compute/tensor/src/copy.cpp +++ b/compute/tensor/src/copy.cpp @@ -57,7 +57,7 @@ EE copy(std::vector inputTensor, UNI_ERROR_LOG("copy %u bytes from src tensor(%u) beyond size(%u).\n", copyLength, srcIndex, inputTensor[0].bytes()); } - memcpy((U8 *)input[1] + dstIndex, (U8 *)input[0] + srcIndex, copyLength); + UNI_MEMCPY((U8 *)input[1] + dstIndex, (U8 *)input[0] + srcIndex, copyLength); ret = SUCCESS; #endif } diff --git a/compute/tensor/src/cpu/argmax.cpp b/compute/tensor/src/cpu/argmax.cpp index a5bb6ba6..afb561d6 100644 --- a/compute/tensor/src/cpu/argmax.cpp +++ b/compute/tensor/src/cpu/argmax.cpp @@ -77,6 +77,10 @@ EE argmax_cpu( break; } #endif + case DT_I32: { + ret = argmax(inputDesc, (const I32 *)input, axis, outputDesc, (U32 *)output); + break; + } default: ret = NOT_SUPPORTED; break; diff --git a/compute/tensor/src/cpu/arm/arm_functions.h b/compute/tensor/src/cpu/arm/arm_functions.h index 20c2fdb8..3c1d60d5 100644 --- a/compute/tensor/src/cpu/arm/arm_functions.h +++ b/compute/tensor/src/cpu/arm/arm_functions.h @@ -107,6 +107,9 @@ inline EE array_minmax_value_arm(DataType dt, const void *data, I32 len, int mod case DT_I32: ret = array_minmax_value_i32((const I32 *)data, len, mode, result); break; + case DT_U32: + ret = array_minmax_value_template((const U32 *)data, len, mode, result); + break; default: ret = NOT_SUPPORTED; break; diff --git a/compute/tensor/src/cpu/arm/bnn/convolution_dorefa_A55.cpp b/compute/tensor/src/cpu/arm/bnn/convolution_dorefa_A55.cpp index 4e70ee93..f46dbd41 100644 --- a/compute/tensor/src/cpu/arm/bnn/convolution_dorefa_A55.cpp +++ b/compute/tensor/src/cpu/arm/bnn/convolution_dorefa_A55.cpp @@ -44,10 +44,10 @@ EE convolution_dorefa_A55(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); U32 strideH = convParamSpec.stride_h; U32 strideW = convParamSpec.stride_w; - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; if (fdf != DF_NCHWN16C8) { CHECK_STATUS(NOT_MATCH); diff --git a/compute/tensor/src/cpu/arm/bnn/convolution_dorefa_A76.cpp b/compute/tensor/src/cpu/arm/bnn/convolution_dorefa_A76.cpp index 83cb462a..bef43159 100644 --- 
a/compute/tensor/src/cpu/arm/bnn/convolution_dorefa_A76.cpp +++ b/compute/tensor/src/cpu/arm/bnn/convolution_dorefa_A76.cpp @@ -44,10 +44,10 @@ EE convolution_dorefa_A76(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); U32 strideH = convParamSpec.stride_h; U32 strideW = convParamSpec.stride_w; - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; if (fdf != DF_NCHWN16C8) { CHECK_STATUS(NOT_MATCH); diff --git a/compute/tensor/src/cpu/arm/bnn/convolution_transform_bnn.h b/compute/tensor/src/cpu/arm/bnn/convolution_transform_bnn.h index 9d9329a6..8afeed50 100644 --- a/compute/tensor/src/cpu/arm/bnn/convolution_transform_bnn.h +++ b/compute/tensor/src/cpu/arm/bnn/convolution_transform_bnn.h @@ -15,8 +15,8 @@ #define _H_CONVOLUTION_TRANSFORM_BNN #include -#include +#include "uni.h" #include "tensor_desc.h" inline void bitwise_copy(BIN8 srcVal, U32 srcBit, BIN8 *dest, U32 destBit) @@ -46,7 +46,7 @@ inline EE convolution_transform_filter_bnn( switch (fdf) { case DF_NCHWN16C8: // Everything is ready - memcpy(ftmArray, filterArray, fn * fc * fh * fw / 8 * bytesOf(fdt)); + UNI_MEMCPY(ftmArray, filterArray, fn * fc * fh * fw / 8 * bytesOf(fdt)); break; case DF_NCHW: { /* diff --git a/compute/tensor/src/cpu/arm/bnn/convolution_xnor_A55.cpp b/compute/tensor/src/cpu/arm/bnn/convolution_xnor_A55.cpp index 92ef5221..1178c32a 100644 --- a/compute/tensor/src/cpu/arm/bnn/convolution_xnor_A55.cpp +++ b/compute/tensor/src/cpu/arm/bnn/convolution_xnor_A55.cpp @@ -44,10 +44,10 @@ EE convolution_xnor_A55(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); U32 strideH = convParamSpec.stride_h; U32 strideW = convParamSpec.stride_w; - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; if (fdf != DF_NCHWN16C8) { CHECK_STATUS(NOT_MATCH); diff --git a/compute/tensor/src/cpu/arm/bnn/convolution_xnor_A76.cpp b/compute/tensor/src/cpu/arm/bnn/convolution_xnor_A76.cpp index 52ae3a88..48ae960b 100644 --- a/compute/tensor/src/cpu/arm/bnn/convolution_xnor_A76.cpp +++ b/compute/tensor/src/cpu/arm/bnn/convolution_xnor_A76.cpp @@ -44,10 +44,10 @@ EE convolution_xnor_A76(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); U32 strideH = convParamSpec.stride_h; U32 strideW = convParamSpec.stride_w; - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; if (fdf != DF_NCHWN16C8) { CHECK_STATUS(NOT_MATCH); diff --git a/compute/tensor/src/cpu/arm/check.cpp b/compute/tensor/src/cpu/arm/check.cpp deleted file mode 100644 index e4e1ac81..00000000 --- a/compute/tensor/src/cpu/arm/check.cpp +++ /dev/null @@ -1,115 +0,0 @@ -// Copyright (C) 2019. 
Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -#include "cpu/arm/tensor_computing_arm.h" -#include "arm_neon_expand.h" -#ifdef _USE_FP32 -#include "cpu/arm/fp32/tensor_computing_fp32.h" -#endif -#ifdef _USE_FP16 -#include "cpu/arm/fp16/tensor_computing_fp16.h" -#endif - -static EE check_u32(TensorDesc inputDescA, - const U32 *inputA, - TensorDesc inputDescB, - const U32 *inputB, - CheckMode checkMode, - TensorDesc outputDesc, - I32 *output) -{ - if (nullptr == inputA || nullptr == inputB || nullptr == output) { - CHECK_STATUS(NULL_POINTER); - } - - if (tensorNumElements(inputDescA) != tensorNumElements(inputDescB)) { - CHECK_STATUS(NOT_MATCH); - } - - U32 size = tensorNumElements(inputDescA); - U32 loopOuter = inputDescA.dims[inputDescA.nDims - 1]; - if (tensorNumElements(outputDesc) != loopOuter) { - CHECK_STATUS(NOT_MATCH); - } - I32 length = size / loopOuter; - for (U32 j = 0; j < loopOuter; j++) { - const U32 *arrayA = inputA + j * length; - const U32 *arrayB = inputB + j * length; - switch (checkMode) { - case CHECK_EQUAL: { - uint32x4_t count_v = vdupq_n_u32(0); - I32 i = 0; - for (; i < length - 3; i += 4) { - uint32x4_t a = vld1q_u32(arrayA + i); - uint32x4_t b = vld1q_u32(arrayA + i); - count_v = vaddq_u32(count_v, vceqq_u32(a, b)); - } - I32 count = vaddvq_u32(count_v); - for (; i < length; i++) { - if (arrayA[i] == arrayB[i]) { - count++; - } - } - output[j] = (count == length); - break; - } - default: - CHECK_STATUS(NOT_SUPPORTED); - break; - } - } - return SUCCESS; -} - -EE check_arm(TensorDesc inputDescA, - const void *inputA, - TensorDesc inputDescB, - const void *inputB, - CheckParamSpec p, - TensorDesc outputDesc, - void *output) -{ - DataType idt = inputDescA.dt; - EE ret = SUCCESS; - switch (idt) { -#ifdef _USE_FP32 - case DT_F32: { - ret = check_fp32(inputDescA, (const F32 *)inputA, inputDescB, (const F32 *)inputB, - p.check_mode, outputDesc, (I32 *)output); - break; - } -#endif -#ifdef _USE_FP16 - case DT_F16: { - ret = check_fp16(inputDescA, (const F16 *)inputA, inputDescB, (const F16 *)inputB, - p.check_mode, outputDesc, (I32 *)output); - break; - } -#endif - case DT_U32: { - ret = check_u32(inputDescA, (const U32 *)inputA, inputDescB, (const U32 *)inputB, - p.check_mode, outputDesc, (I32 *)output); - break; - } - case DT_I32: { - ret = check_u32(inputDescA, (const U32 *)inputA, inputDescB, (const U32 *)inputB, - p.check_mode, outputDesc, (I32 *)output); - break; - } - default: - ret = NOT_SUPPORTED; - break; - } - - 
return ret; -} diff --git a/compute/tensor/src/cpu/arm/convolution.cpp b/compute/tensor/src/cpu/arm/convolution.cpp index 2391fb0e..505a1f7b 100644 --- a/compute/tensor/src/cpu/arm/convolution.cpp +++ b/compute/tensor/src/cpu/arm/convolution.cpp @@ -60,7 +60,7 @@ EE convolution_infer_forward_algorithm_arm(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); it = ft = 1; p.dilatedRate_t = p.stride_t = 1; - p.padding_before = p.padding_after = 0; + p.pad_before = p.pad_after = 0; } else if (tensorIs5d(inputDesc)) { CHECK_STATUS(tensor5dGet(inputDesc, &idt, &idf, &in, &ic, &it, &ih, &iw)); CHECK_STATUS(tensor5dGet(filterDesc, &fdt, &fdf, &fn, &fc, &ft, &fh, &fw)); @@ -75,8 +75,8 @@ EE convolution_infer_forward_algorithm_arm(TensorDesc inputDesc, if ((idf != DF_NCHWC8 || ic / p.group % 8 != 0) && DT_I8 != idt) { *algorithm = CONVOLUTION_ALGORITHM_GEMM_ICNCHW; } else if (ft == 1 && fh == 3 && fw == 3 && p.stride_t == 1 && p.stride_h == 1 && - p.stride_w == 1 && p.padding_before == 0 && p.padding_after == 0 && p.padding_top == 1 && - p.padding_bottom == 1 && p.padding_left == 1 && p.padding_right == 1) { + p.stride_w == 1 && p.pad_before == 0 && p.pad_after == 0 && p.pad_top == 1 && + p.pad_bottom == 1 && p.pad_left == 1 && p.pad_right == 1) { *algorithm = CONVOLUTION_ALGORITHM_WINOGRAD; } else { *algorithm = CONVOLUTION_ALGORITHM_GEMM; @@ -141,7 +141,7 @@ EE convolution_infer_forward_algorithm_arm(TensorDesc inputDesc, CHECK_STATUS(convolution_transform_filter_arm( filterDesc, filter, p, convolutionAlgorithms[i], &ftmDesc, filterTransformed)); - memset(tmp, 0, tmpBytes); + UNI_MEMSET(tmp, 0, tmpBytes); double timeStart = ut_time_ms(); CHECK_STATUS(convolution_arm(inputDesc, input, ftmDesc, filterTransformed, p, convolutionAlgorithms[i], scaleDesc, scale, biasDesc, bias, tmpBytes, tmp, @@ -306,7 +306,7 @@ EE convolution_infer_forward_tmp_bytes_arm(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); it = ft = ot = 1; p.dilatedRate_t = p.stride_t = 1; - p.padding_before = p.padding_after = 0; + p.pad_before = p.pad_after = 0; } else if (tensorIs5d(inputDesc)) { CHECK_STATUS(tensor5dGet(inputDesc, &idt, &idf, &in, &ic, &it, &ih, &iw)); CHECK_STATUS(tensor5dGet(filterDesc, &fdt, &fdf, &fn, &fc, &ft, &fh, &fw)); @@ -314,9 +314,9 @@ EE convolution_infer_forward_tmp_bytes_arm(TensorDesc inputDesc, } else { return NOT_SUPPORTED; } - U32 it_pad = it + p.padding_before + p.padding_after; - U32 ih_pad = ih + p.padding_top + p.padding_bottom; - U32 iw_pad = iw + p.padding_left + p.padding_right; + U32 it_pad = it + p.pad_before + p.pad_after; + U32 ih_pad = ih + p.pad_top + p.pad_bottom; + U32 iw_pad = iw + p.pad_left + p.pad_right; U32 tile_size = 0; switch (fdt) { case DT_F32: @@ -360,10 +360,10 @@ EE convolution_infer_forward_tmp_bytes_arm(TensorDesc inputDesc, case CONVOLUTION_ALGORITHM_WINOGRAD: { U32 tile_h = (oh + 3) / 4; U32 tile_w = (ow + 3) / 4; - U32 pad_left = p.padding_left; - U32 pad_right = p.padding_right + (tile_w * 4 - ow); - U32 pad_top = p.padding_top; - U32 pad_bottom = p.padding_bottom + (tile_h * 4 - oh); + U32 pad_left = p.pad_left; + U32 pad_right = p.pad_right + (tile_w * 4 - ow); + U32 pad_top = p.pad_top; + U32 pad_bottom = p.pad_bottom + (tile_h * 4 - oh); ih_pad = ih + pad_top + pad_bottom; iw_pad = iw + pad_left + pad_right; *bytes = ic * ih_pad * iw_pad * element_size; diff --git a/compute/tensor/src/cpu/arm/deconvolution.cpp b/compute/tensor/src/cpu/arm/deconvolution.cpp index 09f7301e..db47a1cd 
100644 --- a/compute/tensor/src/cpu/arm/deconvolution.cpp +++ b/compute/tensor/src/cpu/arm/deconvolution.cpp @@ -66,8 +66,8 @@ EE deconvolution_overlap_crop_arm_kernel(T *input, U32 fhfw = fh * fw; U32 strideH = convParamSpec.stride_h; U32 strideW = convParamSpec.stride_w; - U32 paddingT = convParamSpec.padding_top; - U32 paddingL = convParamSpec.padding_left; + U32 paddingT = convParamSpec.pad_top; + U32 paddingL = convParamSpec.pad_left; for (U32 kn = 0; kn < in; ++kn) { #ifdef _USE_OPENMP #pragma omp parallel for num_threads(OMP_NUM_THREADS) @@ -96,7 +96,7 @@ EE deconvolution_overlap_crop_arm_kernel(T *input, } } output += oc * oh * ow; - input += ic * ih * iw; + input += oc * fh * fw * ih * iw; } return SUCCESS; diff --git a/compute/tensor/src/cpu/arm/depthwise_convolution.cpp b/compute/tensor/src/cpu/arm/depthwise_convolution.cpp index 3e0d5130..2affc5c1 100644 --- a/compute/tensor/src/cpu/arm/depthwise_convolution.cpp +++ b/compute/tensor/src/cpu/arm/depthwise_convolution.cpp @@ -41,7 +41,7 @@ EE depthwise_convolution_transform_filter_arm(TensorDesc filterDesc, ftmDesc->df = ftmDataFormat; EE ret = NOT_SUPPORTED; if (filterDesc.df == ftmDataFormat) { - memcpy(filterTransformed, filter, tensorNumBytes(filterDesc)); + UNI_MEMCPY(filterTransformed, filter, tensorNumBytes(filterDesc)); ret = SUCCESS; } else if (filterDesc.df == DF_NCHW) { if (ftmDataFormat == DF_NCHWC8) { @@ -69,10 +69,10 @@ EE depthwise_convolution_infer_forward_tmp_bytes_arm(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; U32 ih_pad = ih + paddingT + paddingB; U32 iw_pad = iw + paddingL + paddingR; diff --git a/compute/tensor/src/cpu/arm/depthwise_pointwise_convolution.cpp b/compute/tensor/src/cpu/arm/depthwise_pointwise_convolution.cpp index 84e70dbf..bd8723ac 100644 --- a/compute/tensor/src/cpu/arm/depthwise_pointwise_convolution.cpp +++ b/compute/tensor/src/cpu/arm/depthwise_pointwise_convolution.cpp @@ -56,10 +56,10 @@ EE depthwise_pointwise_convolution_infer_forward_algorithm_arm(TensorDesc inputD case DT_F16: { U32 strideH = convParamSpec.stride_h; U32 strideW = convParamSpec.stride_w; - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; if (fh == 3 && fw == 3 && strideH == 1 && strideW == 1 && paddingT == 1 && paddingB == 1 && paddingL == 1 && paddingR == 1 && ow % 4 == 0 && ow >= 12) { @@ -112,10 +112,10 @@ EE depthwise_pointwise_convolution_infer_forward_tmp_bytes_arm(TensorDesc inputD U32 on, oc, oh, ow; CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = 
convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; U32 ih_pad = ih + paddingT + paddingB; U32 iw_pad = iw + paddingL + paddingR; diff --git a/compute/tensor/src/cpu/arm/fp16/arm_functions_fp16.h b/compute/tensor/src/cpu/arm/fp16/arm_functions_fp16.h index 9674a1c1..1bf628c4 100644 --- a/compute/tensor/src/cpu/arm/fp16/arm_functions_fp16.h +++ b/compute/tensor/src/cpu/arm/fp16/arm_functions_fp16.h @@ -14,11 +14,8 @@ #ifndef _H_ARM_FUNCTIONS_FP16 #define _H_ARM_FUNCTIONS_FP16 -#include +#include "cpu/cpu_functions_template.h" #include "arm_neon_expand.h" -#include "uni.h" -#include "data_type.h" -#include "parameter_spec.h" // array sum inline F32 array_sum_f16(const F16 *data, I32 len) @@ -237,7 +234,7 @@ inline void array_power_f16(F16 *input, F16 *output, I32 len, F32 power) #endif } else if (power == 1) { if (input != output) { - memcpy(output, input, len * sizeof(F16)); + UNI_MEMCPY(output, input, len * sizeof(F16)); } i = len; } else if (power == 2) { @@ -263,137 +260,110 @@ inline void array_power_f16(F16 *input, F16 *output, I32 len, F32 power) inline EE activation_fp16(F16 *input, U32 len, ActivationParamSpec activationDesc, F16 *output) { - float16x8_t in, out; float16x8_t zero = vdupq_n_f16(float16_t(0.)); float16x8_t one = vdupq_n_f16(float16_t(1.)); float16x8_t three = vdupq_n_f16(float16_t(3.)); float16x8_t six = vdupq_n_f16(float16_t(6.)); - U32 len_main = len / 8; - U32 len_tail = len % 8; - - F16 value; + U32 loops = len / 8 * 8; EE ret = SUCCESS; switch (activationDesc.mode) { case ACTIVATION_NULL: { + if (output != input) { + UNI_MEMCPY(output, input, sizeof(F16) * len); + } + loops = len; break; } case ACTIVATION_RELU: { if (activationDesc.value[0] == 0) { - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f16(input); - out = vmaxq_f16(zero, in); - vst1q_f16(output, out); - input += 8; - output += 8; - } - for (U32 i = 0; i < len_tail; i++) { - output[i] = (input[i] < 0) ? 0 : input[i]; +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = 0; i < loops; i += 8) { + float16x8_t in = vld1q_f16(input + i); + float16x8_t out = vmaxq_f16(zero, in); + vst1q_f16(output + i, out); } } else { float16x8_t scale = vdupq_n_f16(activationDesc.value[0]); - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f16(input); +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = 0; i < loops; i += 8) { + float16x8_t in = vld1q_f16(input + i); float16x8_t tmp = vmulq_f16(scale, in); - out = vmaxq_f16(tmp, in); - vst1q_f16(output, out); - input += 8; - output += 8; - } - for (U32 i = 0; i < len_tail; i++) { - float tmp = activationDesc.value[0] * input[i]; - output[i] = (input[i] < tmp) ? tmp : input[i]; + float16x8_t out = vmaxq_f16(tmp, in); + vst1q_f16(output + i, out); } } break; } case ACTIVATION_RELU6: { - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f16(input); - out = vmaxq_f16(zero, in); +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = 0; i < loops; i += 8) { + float16x8_t in = vld1q_f16(input + i); + float16x8_t out = vmaxq_f16(zero, in); out = vminq_f16(six, out); - vst1q_f16(output, out); - input += 8; - output += 8; - } - for (U32 i = 0; i < len_tail; i++) { - value = (input[i] < 0) ? 
0 : input[i]; - if (value > 6) { - value = 6; - } - output[i] = value; + vst1q_f16(output + i, out); } break; } case ACTIVATION_H_SIGMOID: { - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f16(input); - out = vaddq_f16(in, three); +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + + for (U32 i = 0; i < loops; i += 8) { + float16x8_t in = vld1q_f16(input + i); + float16x8_t out = vaddq_f16(in, three); out = vmaxq_f16(out, zero); out = vminq_f16(out, six); out = vdivq_f16(out, six); - vst1q_f16(output, out); - input += 8; - output += 8; - } - for (U32 i = 0; i < len_tail; i++) { - value = input[i] + 3; - value = (value < 0) ? 0 : value; - value = (value > 6) ? 6 : value; - value = value / 6; - output[i] = value; + vst1q_f16(output + i, out); } break; } case ACTIVATION_H_SWISH: { - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f16(input); - out = vaddq_f16(in, three); +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = 0; i < loops; i += 8) { + float16x8_t in = vld1q_f16(input + i); + float16x8_t out = vaddq_f16(in, three); out = vmaxq_f16(out, zero); out = vminq_f16(out, six); out = vdivq_f16(out, six); out = vmulq_f16(out, in); - vst1q_f16(output, out); - input += 8; - output += 8; - } - for (U32 i = 0; i < len_tail; i++) { - value = input[i] + 3; - value = (value < 0) ? 0 : value; - value = (value > 6) ? 6 : value; - value = input[i] * value; - value = value / 6; - output[i] = value; + vst1q_f16(output + i, out); } break; } case ACTIVATION_H_SWISH_NODIV: { - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f16(input); - out = vaddq_f16(in, three); +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = 0; i < loops; i += 8) { + float16x8_t in = vld1q_f16(input + i); + float16x8_t out = vaddq_f16(in, three); out = vmaxq_f16(out, zero); out = vminq_f16(out, six); out = vmulq_f16(out, in); - vst1q_f16(output, out); - input += 8; - output += 8; - } - for (U32 i = 0; i < len_tail; i++) { - value = input[i] + 3; - value = (value < 0) ? 0 : value; - value = (value > 6) ? 
6 : value; - value = input[i] * value; - output[i] = value; + vst1q_f16(output + i, out); } break; } case ACTIVATION_GELU: { - F16 two_div_PI_sqrt = sqrt(2 / 3.14159265358979323846); - float16x8_t vec0 = vdupq_n_f16(two_div_PI_sqrt); + float16x8_t vec0 = vdupq_n_f16(sqrt(2 / 3.14159265358979323846)); float16x8_t vec1 = vdupq_n_f16(float16_t(0.044715)); float16x8_t vec2 = vdupq_n_f16(float16_t(0.5)); - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f16(input); - out = vmulq_f16(in, in); +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = 0; i < loops; i += 8) { + float16x8_t in = vld1q_f16(input + i); + float16x8_t out = vmulq_f16(in, in); out = vmulq_f16(out, in); out = vfmaq_f16(in, vec1, out); out = vmulq_f16(vec0, out); @@ -401,136 +371,122 @@ inline EE activation_fp16(F16 *input, U32 len, ActivationParamSpec activationDes out = vaddq_f16(one, out); out = vmulq_f16(vec2, out); out = vmulq_f16(in, out); - vst1q_f16(output, out); - input += 8; - output += 8; - } - for (U32 i = 0; i < len_tail; i++) { - value = input[i]; - value = two_div_PI_sqrt * (value + 0.044715 * powf(value, 3)); - value = 1.0 - 2.0 / (exp(2.0 * value) + 1.0); - value = 0.5 * (1.0 + value); - value = input[i] * value; - output[i] = value; + vst1q_f16(output + i, out); } break; } case ACTIVATION_TANH: { - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f16(input); - out = vtanhq_f16(in); - vst1q_f16(output, out); - input += 8; - output += 8; - } - for (U32 i = 0; i < len_tail; i++) { - value = 1.0 - 2.0 / (exp(2.0 * input[i]) + 1.0); - output[i] = value; +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = 0; i < loops; i += 8) { + float16x8_t in = vld1q_f16(input + i); + float16x8_t out = vtanhq_f16(in); + vst1q_f16(output + i, out); } break; } case ACTIVATION_SIGMOID: { - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f16(input); - out = vsigmoidq_f16(in); - vst1q_f16(output, out); - input += 8; - output += 8; - } - for (U32 i = 0; i < len_tail; i++) { - value = 1.0 / (1.0 + exp(-1.0 * input[i])); - output[i] = value; +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = 0; i < loops; i += 8) { + float16x8_t in = vld1q_f16(input + i); + float16x8_t out = vsigmoidq_f16(in); + vst1q_f16(output + i, out); } break; } - case ACTIVATION_MISH: { - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f16(input); - out = vmulq_f16( - in, vtanhq_f16(vlogq_f16(vaddq_f16(vexpq_f16_03_percent_error(in), one)))); - vst1q_f16(output, out); - input += 8; - output += 8; - } - for (U32 i = 0; i < len_tail; i++) { - value = input[i] * tanh(log(exp(input[i]) + 1.0)); - output[i] = value; + case ACTIVATION_SWISH: { +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = 0; i < loops; i += 8) { + float16x8_t in = vld1q_f16(input + i); + float16x8_t out = vmulq_f16(in, vsigmoidq_f16(in)); + vst1q_f16(output + i, out); } break; } - case ACTIVATION_GREATER: { - for (U32 i = 0; i < len; i++) { - output[i] = input[i] > 1 ? 
1 : 0; + case ACTIVATION_MISH: { +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = 0; i < loops; i += 8) { + float16x8_t in = vld1q_f16(input + i); + float16x8_t out = vmulq_f16( + in, vtanhq_f16(vlogq_f16(vaddq_f16(vexpq_f16_03_percent_error(in), one)))); + vst1q_f16(output + i, out); } break; } case ACTIVATION_SOFTPLUS: { - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f16(input); - out = vlogq_f16(vaddq_f16(vexpq_f16_03_percent_error(in), one)); - vst1q_f16(output, out); - input += 8; - output += 8; - } - for (U32 i = 0; i < len_tail; i++) { - output[i] = log(1 + exp(input[i])); +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = 0; i < loops; i += 8) { + float16x8_t in = vld1q_f16(input + i); + float16x8_t out = vlogq_f16(vaddq_f16(vexpq_f16_03_percent_error(in), one)); + vst1q_f16(output + i, out); } break; } case ACTIVATION_EXP: { - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f16(input); - out = vexpq_f16_03_percent_error(in); - vst1q_f16(output, out); - input += 8; - output += 8; - } - for (U32 i = 0; i < len_tail; i++) { - output[i] = exp(input[i]); +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = 0; i < loops; i += 8) { + float16x8_t in = vld1q_f16(input + i); + float16x8_t out = vexpq_f16_03_percent_error(in); + vst1q_f16(output + i, out); } break; } case ACTIVATION_ABS: { - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f16(input); - out = vabsq_f16(in); - vst1q_f16(output, out); - input += 8; - output += 8; - } - for (U32 i = 0; i < len_tail; i++) { - output[i] = UNI_ABS(input[i]); - } - break; - } - case ACTIVATION_SIGN: { - for (U32 i = 0; i < len; i++) { - output[i] = UNI_SIGN(input[i]); - } - break; - } - case ACTIVATION_LOG: { - for (U32 i = 0; i < len; i++) { - output[i] = log(input[i]); +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = 0; i < loops; i += 8) { + float16x8_t in = vld1q_f16(input + i); + float16x8_t out = vabsq_f16(in); + vst1q_f16(output + i, out); } break; } - case ACTIVATION_NOT: { - for (U32 i = 0; i < len; i++) { - output[i] = (input[i] > 0) ? 
0 : 1; + case ACTIVATION_RECIPROCAL: { +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = 0; i < loops; i += 8) { + float16x8_t in = vld1q_f16(input + i); + float16x8_t out = vdivq_f16(one, in); + vst1q_f16(output + i, out); } break; } - case ACTIVATION_NEG: { - for (U32 i = 0; i < len; i++) { - output[i] = -input[i]; - } + case ACTIVATION_SIGN: + case ACTIVATION_LOG: + case ACTIVATION_NOT: + case ACTIVATION_GREATER: + case ACTIVATION_NEG: + case ACTIVATION_ROUND: + case ACTIVATION_CEIL: + case ACTIVATION_FLOOR: { + loops = 0; break; } default: ret = NOT_SUPPORTED; break; } + if (ret == SUCCESS) { +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = loops; i < len; i++) { + ret = activation_template(activationDesc, input[i], output + i); + } + } return ret; } diff --git a/compute/tensor/src/cpu/arm/fp16/attention.cpp b/compute/tensor/src/cpu/arm/fp16/attention.cpp index 050203ab..1b22260b 100644 --- a/compute/tensor/src/cpu/arm/fp16/attention.cpp +++ b/compute/tensor/src/cpu/arm/fp16/attention.cpp @@ -11,7 +11,6 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -#include #include "cpu/arm/fp16/tensor_computing_fp16.h" EE attention_fp16(U32 batch, @@ -26,14 +25,14 @@ EE attention_fp16(U32 batch, } F16 mask_s = -10000.0; - I32 count = array_sum_f16(input, toSequenceLength); - I32 valid = UNI_MIN(count, fromSequenceLength); float16x8_t mask_v = vdupq_n_f16(float16_t(mask_s)); float16x8_t one_v = vdupq_n_f16(float16_t(1.0)); for (U32 n = 0; n < batch; n++) { + U32 count = array_sum_f16(input, toSequenceLength); + U32 valid = UNI_MIN(count, (U32)fromSequenceLength); for (U32 i = 0; i < numHeads; i++) { if (i == 0) { - for (I32 j = 0; j < valid; j++) { + for (U32 j = 0; j < valid; j++) { if (j == 0) { I32 k = 0; for (; k < toSequenceLength - 7; k += 8) { @@ -47,12 +46,12 @@ EE attention_fp16(U32 batch, output[k] = value; } } else { - memcpy( + UNI_MEMCPY( output + j * toSequenceLength, output, toSequenceLength * sizeof(F16)); } } - for (I32 j = valid; j < fromSequenceLength; j++) { + for (U32 j = valid; j < (U32)fromSequenceLength; j++) { if (j == valid) { I32 k = 0; for (; k < toSequenceLength - 7; k += 8) { @@ -62,12 +61,12 @@ EE attention_fp16(U32 batch, output[j * toSequenceLength + k] = mask_s; } } else { - memcpy(output + j * toSequenceLength, output + valid * toSequenceLength, + UNI_MEMCPY(output + j * toSequenceLength, output + valid * toSequenceLength, toSequenceLength * sizeof(F16)); } } } else { - memcpy(output + i * fromSequenceLength * toSequenceLength, output, + UNI_MEMCPY(output + i * fromSequenceLength * toSequenceLength, output, fromSequenceLength * toSequenceLength * sizeof(F16)); } } diff --git a/compute/tensor/src/cpu/arm/fp16/attention_mask.cpp b/compute/tensor/src/cpu/arm/fp16/attention_mask.cpp index afad68e5..564e5db0 100644 --- a/compute/tensor/src/cpu/arm/fp16/attention_mask.cpp +++ b/compute/tensor/src/cpu/arm/fp16/attention_mask.cpp @@ -11,7 +11,6 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-#include #include "cpu/arm/fp16/tensor_computing_fp16.h" EE attention_mask_fp16(TensorDesc inputDesc, @@ -56,7 +55,7 @@ EE attention_mask_fp16(TensorDesc inputDesc, if (start + loops > klen) { loops = UNI_MAX(klen - start, 0); } - memset(&mask[i * klen + start], 0, sizeof(F16) * loops); + UNI_MEMSET(&mask[i * klen + start], 0, sizeof(F16) * loops); } } I32 loops = tensorNumElements(inputDesc) / length; diff --git a/compute/tensor/src/cpu/arm/fp16/check.cpp b/compute/tensor/src/cpu/arm/fp16/check.cpp deleted file mode 100644 index 139677cd..00000000 --- a/compute/tensor/src/cpu/arm/fp16/check.cpp +++ /dev/null @@ -1,99 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -#include "cpu/arm/fp16/tensor_computing_fp16.h" - -EE check_fp16(TensorDesc inputDescA, - const F16 *inputA, - TensorDesc inputDescB, - const F16 *inputB, - CheckMode checkMode, - TensorDesc outputDesc, - I32 *output) -{ - if (nullptr == inputA || nullptr == inputB || nullptr == output) { - CHECK_STATUS(NULL_POINTER); - } - - if (tensorNumElements(inputDescA) != tensorNumElements(inputDescB)) { - CHECK_STATUS(NOT_MATCH); - } - - U32 size = tensorNumElements(inputDescA); - U32 loopOuter = inputDescA.dims[inputDescA.nDims - 1]; - I32 length = size / loopOuter; - if (tensorNumElements(outputDesc) != loopOuter) { - CHECK_STATUS(NOT_MATCH); - } - for (U32 j = 0; j < loopOuter; j++) { - const F16 *arrayA = inputA + j * length; - const F16 *arrayB = inputB + j * length; - switch (checkMode) { - case CHECK_GREAT: { - uint16x8_t count_v = vdupq_n_u16(0); - I32 i = 0; - for (; i < length - 7; i += 8) { - float16x8_t a = vld1q_f16(arrayA + i); - float16x8_t b = vld1q_f16(arrayA + i); - count_v = vaddq_u16(count_v, vcgtq_f16(a, b)); - } - I32 count = vaddvq_u16(count_v); - for (; i < length; i++) { - if (arrayA[i] > arrayB[i]) { - count++; - } - } - output[j] = (count == length); - break; - } - case CHECK_GREATEQUAL: { - uint16x8_t count_v = vdupq_n_u16(0); - I32 i = 0; - for (; i < length - 7; i += 8) { - float16x8_t a = vld1q_f16(arrayA + i); - float16x8_t b = vld1q_f16(arrayA + i); - count_v = vaddq_u16(count_v, vcgeq_f16(a, b)); - } - I32 count = vaddvq_u16(count_v); - for (; i < length; i++) { - if (arrayA[i] >= arrayB[i]) { - count++; - } - } - output[j] = (count == length); - break; - } - case CHECK_EQUAL: { - uint16x8_t count_v = vdupq_n_u16(0); - I32 i = 0; - for (; i < length - 7; i += 8) { - float16x8_t a = vld1q_f16(arrayA + i); - float16x8_t b = 
vld1q_f16(arrayA + i); - count_v = vaddq_u16(count_v, vceqq_f16(a, b)); - } - I32 count = vaddvq_u16(count_v); - for (; i < length; i++) { - if (arrayA[i] == arrayB[i]) { - count++; - } - } - output[j] = (count == length); - break; - } - default: - CHECK_STATUS(NOT_SUPPORTED); - break; - } - } - return SUCCESS; -} diff --git a/compute/tensor/src/cpu/arm/fp16/clip.cpp b/compute/tensor/src/cpu/arm/fp16/clip.cpp index 3f19ae9e..d9b63e61 100644 --- a/compute/tensor/src/cpu/arm/fp16/clip.cpp +++ b/compute/tensor/src/cpu/arm/fp16/clip.cpp @@ -21,14 +21,15 @@ EE clip_fp16(F16 *input, F16 *output, I32 len, F32 minValue, F32 maxValue) float16x8_t min_v = vdupq_n_f16(minValue); float16x8_t max_v = vdupq_n_f16(maxValue); - - I32 i = 0; - for (i = 0; i < len - 7; i += 8) { +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (int i = 0; i < len - 7; i += 8) { float16x8_t in = vld1q_f16(input + i); float16x8_t tmp_v = vminq_f16(max_v, vmaxq_f16(min_v, in)); vst1q_f16(output + i, tmp_v); } - for (; i < len; i++) { + for (int i = len / 8 * 8; i < len; i++) { F16 value = input[i]; value = (value > minValue) ? value : minValue; value = (value < maxValue) ? value : maxValue; diff --git a/compute/tensor/src/cpu/arm/fp16/convolution_direct.cpp b/compute/tensor/src/cpu/arm/fp16/convolution_direct.cpp index 3782db73..b356e8b6 100644 --- a/compute/tensor/src/cpu/arm/fp16/convolution_direct.cpp +++ b/compute/tensor/src/cpu/arm/fp16/convolution_direct.cpp @@ -11,8 +11,6 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -#include - #include "cpu/arm/fp16/convolution_direct.h" EE convolution_direct(TensorDesc inputDesc, @@ -43,10 +41,10 @@ EE convolution_direct(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); U32 strideH = convParamSpec.stride_h; U32 strideW = convParamSpec.stride_w; - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; if (fdf != DF_NCHWN16) { CHECK_STATUS(NOT_MATCH); @@ -67,20 +65,20 @@ EE convolution_direct(TensorDesc inputDesc, F16 *inArray_mov = inArray + n * ic * ih * iw * 8; for (U32 c = 0; c < ic; c++) { for (U32 h = 0; h < paddingT; h++) { - memset(inArray_pad_mov, 0, iw_pad * 8 * bytesOf(idt)); + UNI_MEMSET(inArray_pad_mov, 0, iw_pad * 8 * bytesOf(idt)); inArray_pad_mov += iw_pad * 8; } for (U32 h = paddingT; h < ih_pad - paddingB; h++) { - memset(inArray_pad_mov, 0, paddingL * 8 * bytesOf(idt)); + UNI_MEMSET(inArray_pad_mov, 0, paddingL * 8 * bytesOf(idt)); inArray_pad_mov += paddingL * 8; - memcpy(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(idt)); + UNI_MEMCPY(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(idt)); inArray_pad_mov += iw * 8; inArray_mov += iw * 8; - memset(inArray_pad_mov, 0, paddingR * 8 * bytesOf(idt)); + UNI_MEMSET(inArray_pad_mov, 0, paddingR * 8 * bytesOf(idt)); inArray_pad_mov += paddingR * 8; } for (U32 h = ih_pad - paddingB; h < ih_pad; h++) { - memset(inArray_pad_mov, 0, iw_pad * 8 * bytesOf(idt)); + UNI_MEMSET(inArray_pad_mov, 0, iw_pad * 8 * bytesOf(idt)); inArray_pad_mov += iw_pad * 8; } } @@ -400,10 +398,9 @@ 
EE convolution_direct(TensorDesc inputDesc, : [in_h0w0] "r"(in_h0w0), [in_h0w1] "r"(in_h0w1), [in_h0w2] "r"(in_h0w2), [in_h0w3] "r"(in_h0w3), [in_h1w0] "r"(in_h1w0), [in_h1w1] "r"(in_h1w1), - [in_h1w2] "r"(in_h1w2), [in_h1w3] "r"(in_h1w3), - [f_c0] "r"(f_c0), [f_c1] "r"(f_c1), [f_c2] "r"(f_c2), - [f_c3] "r"(f_c3), [f_c4] "r"(f_c4), [f_c5] "r"(f_c5), - [f_c6] "r"(f_c6), [f_c7] "r"(f_c7) + [in_h1w2] "r"(in_h1w2), [in_h1w3] "r"(in_h1w3), [f_c0] "r"(f_c0), + [f_c1] "r"(f_c1), [f_c2] "r"(f_c2), [f_c3] "r"(f_c3), + [f_c4] "r"(f_c4), [f_c5] "r"(f_c5), [f_c6] "r"(f_c6), [f_c7] "r"(f_c7) : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", diff --git a/compute/tensor/src/cpu/arm/fp16/convolution_gemm_A55.cpp b/compute/tensor/src/cpu/arm/fp16/convolution_gemm_A55.cpp index 0ac90160..9e39bf6e 100644 --- a/compute/tensor/src/cpu/arm/fp16/convolution_gemm_A55.cpp +++ b/compute/tensor/src/cpu/arm/fp16/convolution_gemm_A55.cpp @@ -42,7 +42,7 @@ EE convolution_gemm_A55(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); it = ft = ot = 1; p.dilatedRate_t = p.stride_t = 1; - p.padding_before = p.padding_after = 0; + p.pad_before = p.pad_after = 0; } else if (tensorIs5d(inputDesc)) { CHECK_STATUS(tensor5dGet(inputDesc, &idt, &idf, &in, &ic, &it, &ih, &iw)); CHECK_STATUS(tensor5dGet(filterDesc, &fdt, &fdf, &fn, &fc, &ft, &fh, &fw)); @@ -56,9 +56,9 @@ EE convolution_gemm_A55(TensorDesc inputDesc, } oc /= 8; - U32 it_pad = it + p.padding_before + p.padding_after; - U32 ih_pad = ih + p.padding_top + p.padding_bottom; - U32 iw_pad = iw + p.padding_left + p.padding_right; + U32 it_pad = it + p.pad_before + p.pad_after; + U32 ih_pad = ih + p.pad_top + p.pad_bottom; + U32 iw_pad = iw + p.pad_left + p.pad_right; I64 K = ic * ft * fh * fw; I32 ohow = ot * oh * ow; int oc_1 = oc - 1; diff --git a/compute/tensor/src/cpu/arm/fp16/convolution_gemm_A76.cpp b/compute/tensor/src/cpu/arm/fp16/convolution_gemm_A76.cpp index 5aa70b33..d89e4c0f 100644 --- a/compute/tensor/src/cpu/arm/fp16/convolution_gemm_A76.cpp +++ b/compute/tensor/src/cpu/arm/fp16/convolution_gemm_A76.cpp @@ -42,7 +42,7 @@ EE convolution_gemm_A76(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); it = ft = ot = 1; p.dilatedRate_t = p.stride_t = 1; - p.padding_before = p.padding_after = 0; + p.pad_before = p.pad_after = 0; } else if (tensorIs5d(inputDesc)) { CHECK_STATUS(tensor5dGet(inputDesc, &idt, &idf, &in, &ic, &it, &ih, &iw)); CHECK_STATUS(tensor5dGet(filterDesc, &fdt, &fdf, &fn, &fc, &ft, &fh, &fw)); @@ -56,9 +56,9 @@ EE convolution_gemm_A76(TensorDesc inputDesc, } oc /= 8; - U32 it_pad = it + p.padding_before + p.padding_after; - U32 ih_pad = ih + p.padding_top + p.padding_bottom; - U32 iw_pad = iw + p.padding_left + p.padding_right; + U32 it_pad = it + p.pad_before + p.pad_after; + U32 ih_pad = ih + p.pad_top + p.pad_bottom; + U32 iw_pad = iw + p.pad_left + p.pad_right; I64 K = ic * ft * fh * fw; I32 ohow = ot * oh * ow; int oc_1 = oc - 1; diff --git a/compute/tensor/src/cpu/arm/fp16/convolution_gemm_icnchw.h b/compute/tensor/src/cpu/arm/fp16/convolution_gemm_icnchw.h index e6573d8b..4ee41a70 100644 --- a/compute/tensor/src/cpu/arm/fp16/convolution_gemm_icnchw.h +++ b/compute/tensor/src/cpu/arm/fp16/convolution_gemm_icnchw.h @@ -14,7 +14,6 @@ #ifndef _H_CONVOLUTION_GEMM_ICNCHW #define _H_CONVOLUTION_GEMM_ICNCHW -#include #include "sys.h" 
#include "tensor_desc.h" #include "parameter_spec.h" diff --git a/compute/tensor/src/cpu/arm/fp16/convolution_gemm_icnchw_A55.cpp b/compute/tensor/src/cpu/arm/fp16/convolution_gemm_icnchw_A55.cpp index d784f644..1fbd7293 100644 --- a/compute/tensor/src/cpu/arm/fp16/convolution_gemm_icnchw_A55.cpp +++ b/compute/tensor/src/cpu/arm/fp16/convolution_gemm_icnchw_A55.cpp @@ -42,7 +42,7 @@ EE convolution_gemm_icnchw_A55(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); it = ft = ot = 1; p.dilatedRate_t = p.stride_t = 1; - p.padding_before = p.padding_after = 0; + p.pad_before = p.pad_after = 0; } else if (tensorIs5d(inputDesc)) { CHECK_STATUS(tensor5dGet(inputDesc, &idt, &idf, &in, &ic, &it, &ih, &iw)); CHECK_STATUS(tensor5dGet(filterDesc, &fdt, &fdf, &fn, &fc, &ft, &fh, &fw)); @@ -56,9 +56,9 @@ EE convolution_gemm_icnchw_A55(TensorDesc inputDesc, } oc /= 8; - U32 it_pad = it + p.padding_before + p.padding_after; - U32 ih_pad = ih + p.padding_top + p.padding_bottom; - U32 iw_pad = iw + p.padding_left + p.padding_right; + U32 it_pad = it + p.pad_before + p.pad_after; + U32 ih_pad = ih + p.pad_top + p.pad_bottom; + U32 iw_pad = iw + p.pad_left + p.pad_right; I64 K = ic * ft * fh * fw; I32 ohow = ot * oh * ow; int oc_1 = oc - 1; diff --git a/compute/tensor/src/cpu/arm/fp16/convolution_gemm_icnchw_A76.cpp b/compute/tensor/src/cpu/arm/fp16/convolution_gemm_icnchw_A76.cpp index 30eca92b..20418bbe 100644 --- a/compute/tensor/src/cpu/arm/fp16/convolution_gemm_icnchw_A76.cpp +++ b/compute/tensor/src/cpu/arm/fp16/convolution_gemm_icnchw_A76.cpp @@ -42,7 +42,7 @@ EE convolution_gemm_icnchw_A76(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); it = ft = ot = 1; p.dilatedRate_t = p.stride_t = 1; - p.padding_before = p.padding_after = 0; + p.pad_before = p.pad_after = 0; } else if (tensorIs5d(inputDesc)) { CHECK_STATUS(tensor5dGet(inputDesc, &idt, &idf, &in, &ic, &it, &ih, &iw)); CHECK_STATUS(tensor5dGet(filterDesc, &fdt, &fdf, &fn, &fc, &ft, &fh, &fw)); @@ -56,9 +56,9 @@ EE convolution_gemm_icnchw_A76(TensorDesc inputDesc, } oc /= 8; - U32 it_pad = it + p.padding_before + p.padding_after; - U32 ih_pad = ih + p.padding_top + p.padding_bottom; - U32 iw_pad = iw + p.padding_left + p.padding_right; + U32 it_pad = it + p.pad_before + p.pad_after; + U32 ih_pad = ih + p.pad_top + p.pad_bottom; + U32 iw_pad = iw + p.pad_left + p.pad_right; I64 K = ic * ft * fh * fw; I32 ohow = ot * oh * ow; int oc_1 = oc - 1; diff --git a/compute/tensor/src/cpu/arm/fp16/convolution_transform.cpp b/compute/tensor/src/cpu/arm/fp16/convolution_transform.cpp index 172dd435..480e5e9b 100644 --- a/compute/tensor/src/cpu/arm/fp16/convolution_transform.cpp +++ b/compute/tensor/src/cpu/arm/fp16/convolution_transform.cpp @@ -25,7 +25,7 @@ static EE convolution_transform_filter_kernel_fp16(TensorDesc filterDesc, } if (filterDesc.df == ftmDataFormat) { *ftmDesc = filterDesc; - memcpy(ftmArray, filterArray, tensorNumBytes(filterDesc)); + UNI_MEMCPY(ftmArray, filterArray, tensorNumBytes(filterDesc)); return SUCCESS; } if (filterDesc.df != DF_NCHW) { diff --git a/compute/tensor/src/cpu/arm/fp16/convolution_winograd_A55.cpp b/compute/tensor/src/cpu/arm/fp16/convolution_winograd_A55.cpp index 811bdc6e..6b562f58 100644 --- a/compute/tensor/src/cpu/arm/fp16/convolution_winograd_A55.cpp +++ b/compute/tensor/src/cpu/arm/fp16/convolution_winograd_A55.cpp @@ -40,10 +40,10 @@ EE convolution_winograd_A55(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, 
&in, &ic, &ih, &iw)); CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; if (fdf != DF_HWNCN16) { CHECK_STATUS(NOT_MATCH); @@ -77,8 +77,8 @@ EE convolution_winograd_A55(TensorDesc inputDesc, int oc_1 = oc - 1; // copy input into a input with padding for (U32 n = 0; n < in; n++) { - convParamSpec.padding_bottom = pad_bottom; - convParamSpec.padding_right = pad_right; + convParamSpec.pad_bottom = pad_bottom; + convParamSpec.pad_right = pad_right; F16 *inArray_pad = convolution_input_padding_per_channel( n, ic, 1, ih, iw, convParamSpec, inArray, (F16 *)tmp); @@ -454,15 +454,15 @@ EE convolution_winograd_A55(TensorDesc inputDesc, // itm[c8*4 + 3] = Iw3[i][c8]; // } - __asm__ __volatile__("ldr q0, [%[in_0]]\n" - "ldr q1, [%[in_1]]\n" - "ldr q2, [%[in_2]]\n" - "ldr q3, [%[in_3]]\n" - "st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%[itm]]\n" - : [itm] "+r"(itm) - : [in_0] "r"(Iw0[i]), [in_1] "r"(Iw1[i]), - [in_2] "r"(Iw2[i]), [in_3] "r"(Iw3[i]) - : "memory", "cc", "v0", "v1", "v2", "v3"); + __asm__ __volatile__( + "ldr q0, [%[in_0]]\n" + "ldr q1, [%[in_1]]\n" + "ldr q2, [%[in_2]]\n" + "ldr q3, [%[in_3]]\n" + "st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%[itm]]\n" + : [itm] "+r"(itm) + : [in_0] "r"(Iw0[i]), [in_1] "r"(Iw1[i]), [in_2] "r"(Iw2[i]), [in_3] "r"(Iw3[i]) + : "memory", "cc", "v0", "v1", "v2", "v3"); } } for (I32 o = 0; o < oc_1; o += 2) { @@ -663,7 +663,7 @@ EE convolution_winograd_A55(TensorDesc inputDesc, // for (U32 c8 = 0; c8 < 8; c8++) { // itm[c8] = Iw0[i][c8]; // } - memcpy(itm, Iw0[i], 8 * bytesOf(idt)); + UNI_MEMCPY(itm, Iw0[i], 8 * bytesOf(idt)); } } for (I32 o = 0; o < oc_1; o += 2) { diff --git a/compute/tensor/src/cpu/arm/fp16/convolution_winograd_A76.cpp b/compute/tensor/src/cpu/arm/fp16/convolution_winograd_A76.cpp index 852bcb41..ff799909 100644 --- a/compute/tensor/src/cpu/arm/fp16/convolution_winograd_A76.cpp +++ b/compute/tensor/src/cpu/arm/fp16/convolution_winograd_A76.cpp @@ -40,10 +40,10 @@ EE convolution_winograd_A76(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; if (fdf != DF_HWNCN16) { CHECK_STATUS(NOT_MATCH); @@ -78,8 +78,8 @@ EE convolution_winograd_A76(TensorDesc inputDesc, EE ret = SUCCESS; // copy input into a input with padding for (U32 n = 0; n < in; n++) { - convParamSpec.padding_bottom = pad_bottom; - convParamSpec.padding_right = pad_right; + convParamSpec.pad_bottom = pad_bottom; + convParamSpec.pad_right = pad_right; F16 *inArray_pad = convolution_input_padding_per_channel( n, ic, 1, ih, iw, convParamSpec, inArray, (F16 *)tmp); @@ -413,15 +413,15 @@ EE convolution_winograd_A76(TensorDesc inputDesc, // itm[c8*4 + 2] 
= Iw2[i][c8]; // itm[c8*4 + 3] = Iw3[i][c8]; // } - __asm__ __volatile__("ldr q0, [%[in_0]]\n" - "ldr q1, [%[in_1]]\n" - "ldr q2, [%[in_2]]\n" - "ldr q3, [%[in_3]]\n" - "st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%[itm]]\n" - : [itm] "+r"(itm) - : [in_0] "r"(Iw0[i]), [in_1] "r"(Iw1[i]), - [in_2] "r"(Iw2[i]), [in_3] "r"(Iw3[i]) - : "memory", "cc", "v0", "v1", "v2", "v3"); + __asm__ __volatile__( + "ldr q0, [%[in_0]]\n" + "ldr q1, [%[in_1]]\n" + "ldr q2, [%[in_2]]\n" + "ldr q3, [%[in_3]]\n" + "st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%[itm]]\n" + : [itm] "+r"(itm) + : [in_0] "r"(Iw0[i]), [in_1] "r"(Iw1[i]), [in_2] "r"(Iw2[i]), [in_3] "r"(Iw3[i]) + : "memory", "cc", "v0", "v1", "v2", "v3"); } } for (I32 o = 0; o < oc_1; o += 2) { @@ -603,7 +603,7 @@ EE convolution_winograd_A76(TensorDesc inputDesc, // for (U32 c8 = 0; c8 < 8; c8++) { // itm[c8] = Iw0[i][c8]; // } - memcpy(itm, Iw0[i], 8 * bytesOf(idt)); + UNI_MEMCPY(itm, Iw0[i], 8 * bytesOf(idt)); } } for (I32 o = 0; o < oc_1; o += 2) { diff --git a/compute/tensor/src/cpu/arm/fp16/convolution_winograd_transform.h b/compute/tensor/src/cpu/arm/fp16/convolution_winograd_transform.h index 6580f66e..f9c3ee43 100644 --- a/compute/tensor/src/cpu/arm/fp16/convolution_winograd_transform.h +++ b/compute/tensor/src/cpu/arm/fp16/convolution_winograd_transform.h @@ -14,7 +14,7 @@ #ifndef _H_WINOGRAD_TRANSFORM #define _H_WINOGRAD_TRANSFORM -#include +#include #include "cpu/arm/fp16/arm_functions_fp16.h" inline void trans_W_4x4_3x3(F16 *Fw[36], F16 *const F[9]) @@ -297,22 +297,22 @@ inline void trans_I_4x4_3x3(F16 *Iw[36], F16 *const I[36]) vst1q_f16(Iw[i * 6 + 4], v_Iw4); vst1q_f16(Iw[i * 6 + 5], v_Iw5); } else { - F16 max = vmaxvq_f16(v_Iw0); - F16 min = vminvq_f16(v_Iw0); - if (UNI_ISNAN(max) || UNI_ISINF(max) || UNI_ISNAN(min) || UNI_ISINF(min)) { + F32 max = vmaxvq_f16(v_Iw0); + F32 min = vminvq_f16(v_Iw0); + if (isnan(max) || isinf(max) || isnan(min) || isinf(min)) { F16 check[8]; vst1q_f16(check, v_Iw0); for (U32 c = 0; c < 8; c++) { - F16 tmp = check[c]; - if (UNI_ISINF(tmp)) { + F32 tmp = check[c]; + if (isinf(tmp)) { if (tmp > 0) { check[c] = 65504; // FMAX for F16 } else { check[c] = -65504; } - } else if (UNI_ISNAN(tmp)) { + } else if (isnan(tmp)) { tmp = (T[i][0][c] - T[i][2][c]) * 4; - if (UNI_ISINF(tmp)) { + if (isinf(tmp)) { if (tmp > 0) { tmp = 65504; // FMAX for F16 } else { @@ -321,7 +321,7 @@ inline void trans_I_4x4_3x3(F16 *Iw[36], F16 *const I[36]) } F16 diff = T[i][4][c] - T[i][2][c]; tmp += diff; - if (UNI_ISINF(tmp)) { + if (isinf(tmp)) { if (diff > 0) { tmp = 65504; } else { @@ -331,27 +331,27 @@ inline void trans_I_4x4_3x3(F16 *Iw[36], F16 *const I[36]) check[c] = tmp; } } - memcpy(Iw[i * 6 + 0], check, 8 * bytesOf(DT_F16)); + UNI_MEMCPY(Iw[i * 6 + 0], check, 8 * bytesOf(DT_F16)); } else { vst1q_f16(Iw[i * 6 + 0], v_Iw0); } max = vmaxvq_f16(v_Iw1); min = vminvq_f16(v_Iw1); - if (UNI_ISNAN(max) || UNI_ISINF(max) || UNI_ISNAN(min) || UNI_ISINF(min)) { + if (isnan(max) || isinf(max) || isnan(min) || isinf(min)) { F16 check[8]; vst1q_f16(check, v_Iw1); for (U32 c = 0; c < 8; c++) { - F16 tmp = check[c]; - if (UNI_ISINF(tmp)) { + F32 tmp = check[c]; + if (isinf(tmp)) { if (tmp > 0) { check[c] = 65504; // FMAX for F16 } else { check[c] = -65504; } - } else if (UNI_ISNAN(tmp)) { + } else if (isnan(tmp)) { tmp = (T[i][1][c] + T[i][2][c]) * -4; - if (UNI_ISINF(tmp)) { + if (isinf(tmp)) { if (tmp > 0) { tmp = 65504; // FMAX for F16 } else { @@ -360,7 +360,7 @@ inline void trans_I_4x4_3x3(F16 *Iw[36], F16 *const I[36]) } F16 sum = T[i][3][c] + T[i][4][c]; 
tmp += sum; - if (UNI_ISINF(tmp)) { + if (isinf(tmp)) { if (sum > 0) { tmp = 65504; } else { @@ -370,27 +370,27 @@ inline void trans_I_4x4_3x3(F16 *Iw[36], F16 *const I[36]) check[c] = tmp; } } - memcpy(Iw[i * 6 + 1], check, 8 * bytesOf(DT_F16)); + UNI_MEMCPY(Iw[i * 6 + 1], check, 8 * bytesOf(DT_F16)); } else { vst1q_f16(Iw[i * 6 + 1], v_Iw1); } max = vmaxvq_f16(v_Iw2); min = vminvq_f16(v_Iw2); - if (UNI_ISNAN(max) || UNI_ISINF(max) || UNI_ISNAN(min) || UNI_ISINF(min)) { + if (isnan(max) || isinf(max) || isnan(min) || isinf(min)) { F16 check[8]; vst1q_f16(check, v_Iw2); for (U32 c = 0; c < 8; c++) { - F16 tmp = check[c]; - if (UNI_ISINF(tmp)) { + F32 tmp = check[c]; + if (isinf(tmp)) { if (tmp > 0) { check[c] = 65504; // FMAX for F16 } else { check[c] = -65504; } - } else if (UNI_ISNAN(tmp)) { + } else if (isnan(tmp)) { tmp = (T[i][1][c] - T[i][2][c]) * 4; - if (UNI_ISINF(tmp)) { + if (isinf(tmp)) { if (tmp > 0) { tmp = 65504; // FMAX for F16 } else { @@ -399,7 +399,7 @@ inline void trans_I_4x4_3x3(F16 *Iw[36], F16 *const I[36]) } F16 diff = T[i][4][c] - T[i][3][c]; tmp += diff; - if (UNI_ISINF(tmp)) { + if (isinf(tmp)) { if (diff > 0) { tmp = 65504; } else { @@ -409,27 +409,27 @@ inline void trans_I_4x4_3x3(F16 *Iw[36], F16 *const I[36]) check[c] = tmp; } } - memcpy(Iw[i * 6 + 2], check, 8 * bytesOf(DT_F16)); + UNI_MEMCPY(Iw[i * 6 + 2], check, 8 * bytesOf(DT_F16)); } else { vst1q_f16(Iw[i * 6 + 2], v_Iw2); } max = vmaxvq_f16(v_Iw3); min = vminvq_f16(v_Iw3); - if (UNI_ISNAN(max) || UNI_ISINF(max) || UNI_ISNAN(min) || UNI_ISINF(min)) { + if (isnan(max) || isinf(max) || isnan(min) || isinf(min)) { F16 check[8]; vst1q_f16(check, v_Iw3); for (U32 c = 0; c < 8; c++) { - F16 tmp = check[c]; - if (UNI_ISINF(tmp)) { + F32 tmp = check[c]; + if (isinf(tmp)) { if (tmp > 0) { check[c] = 65504; // FMAX for F16 } else { check[c] = -65504; } - } else if (UNI_ISNAN(tmp)) { + } else if (isnan(tmp)) { tmp = (T[i][3][c] - T[i][1][c]) * 2; - if (UNI_ISINF(tmp)) { + if (isinf(tmp)) { if (tmp > 0) { tmp = 65504; // FMAX for F16 } else { @@ -438,7 +438,7 @@ inline void trans_I_4x4_3x3(F16 *Iw[36], F16 *const I[36]) } F16 diff = T[i][4][c] - T[i][2][c]; tmp += diff; - if (UNI_ISINF(tmp)) { + if (isinf(tmp)) { if (diff > 0) { tmp = 65504; } else { @@ -448,27 +448,27 @@ inline void trans_I_4x4_3x3(F16 *Iw[36], F16 *const I[36]) check[c] = tmp; } } - memcpy(Iw[i * 6 + 3], check, 8 * bytesOf(DT_F16)); + UNI_MEMCPY(Iw[i * 6 + 3], check, 8 * bytesOf(DT_F16)); } else { vst1q_f16(Iw[i * 6 + 3], v_Iw3); } max = vmaxvq_f16(v_Iw4); min = vminvq_f16(v_Iw4); - if (UNI_ISNAN(max) || UNI_ISINF(max) || UNI_ISNAN(min) || UNI_ISINF(min)) { + if (isnan(max) || isinf(max) || isnan(min) || isinf(min)) { F16 check[8]; vst1q_f16(check, v_Iw4); for (U32 c = 0; c < 8; c++) { - F16 tmp = check[c]; - if (UNI_ISINF(tmp)) { + F32 tmp = check[c]; + if (isinf(tmp)) { if (tmp > 0) { check[c] = 65504; // FMAX for F16 } else { check[c] = -65504; } - } else if (UNI_ISNAN(tmp)) { + } else if (isnan(tmp)) { tmp = (T[i][1][c] - T[i][3][c]) * 2; - if (UNI_ISINF(tmp)) { + if (isinf(tmp)) { if (tmp > 0) { tmp = 65504; // FMAX for F16 } else { @@ -477,7 +477,7 @@ inline void trans_I_4x4_3x3(F16 *Iw[36], F16 *const I[36]) } F16 diff = T[i][4][c] - T[i][2][c]; tmp += diff; - if (UNI_ISINF(tmp)) { + if (isinf(tmp)) { if (diff > 0) { tmp = 65504; } else { @@ -487,27 +487,27 @@ inline void trans_I_4x4_3x3(F16 *Iw[36], F16 *const I[36]) check[c] = tmp; } } - memcpy(Iw[i * 6 + 4], check, 8 * bytesOf(DT_F16)); + UNI_MEMCPY(Iw[i * 6 + 4], check, 8 * 
bytesOf(DT_F16)); } else { vst1q_f16(Iw[i * 6 + 4], v_Iw4); } max = vmaxvq_f16(v_Iw5); min = vminvq_f16(v_Iw5); - if (UNI_ISNAN(max) || UNI_ISINF(max) || UNI_ISNAN(min) || UNI_ISINF(min)) { + if (isnan(max) || isinf(max) || isnan(min) || isinf(min)) { F16 check[8]; vst1q_f16(check, v_Iw5); for (U32 c = 0; c < 8; c++) { - F16 tmp = check[c]; - if (UNI_ISINF(tmp)) { + F32 tmp = check[c]; + if (isinf(tmp)) { if (tmp > 0) { check[c] = 65504; // FMAX for F16 } else { check[c] = -65504; } - } else if (UNI_ISNAN(tmp)) { + } else if (isnan(tmp)) { tmp = (T[i][1][c] - T[i][3][c]) * 4; - if (UNI_ISINF(tmp)) { + if (isinf(tmp)) { if (tmp > 0) { tmp = 65504; // FMAX for F16 } else { @@ -516,7 +516,7 @@ inline void trans_I_4x4_3x3(F16 *Iw[36], F16 *const I[36]) } F16 diff = T[i][5][c] - T[i][3][c]; tmp += diff; - if (UNI_ISINF(tmp)) { + if (isinf(tmp)) { if (diff > 0) { tmp = 65504; } else { @@ -526,7 +526,7 @@ inline void trans_I_4x4_3x3(F16 *Iw[36], F16 *const I[36]) check[c] = tmp; } } - memcpy(Iw[i * 6 + 5], check, 8 * bytesOf(DT_F16)); + UNI_MEMCPY(Iw[i * 6 + 5], check, 8 * bytesOf(DT_F16)); } else { vst1q_f16(Iw[i * 6 + 5], v_Iw5); } diff --git a/compute/tensor/src/cpu/arm/fp16/deconvolution_transform.cpp b/compute/tensor/src/cpu/arm/fp16/deconvolution_transform.cpp index 85e52e08..19a57973 100644 --- a/compute/tensor/src/cpu/arm/fp16/deconvolution_transform.cpp +++ b/compute/tensor/src/cpu/arm/fp16/deconvolution_transform.cpp @@ -25,7 +25,7 @@ inline EE deconvolution_transform_filter_kernel_fp16(TensorDesc filterDesc, } if (filterDesc.df == ftmDataFormat) { *ftmDesc = filterDesc; - memcpy(ftmArray, filterArray, tensorNumBytes(filterDesc)); + UNI_MEMCPY(ftmArray, filterArray, tensorNumBytes(filterDesc)); return SUCCESS; } if (filterDesc.df != DF_NCHW) { diff --git a/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_A55.cpp b/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_A55.cpp index 2998aa5a..e6201bb7 100644 --- a/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_A55.cpp +++ b/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_A55.cpp @@ -43,10 +43,10 @@ EE depthwise_pointwise_convolution_direct_A55(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); U32 strideH = convParamSpec.stride_h; U32 strideW = convParamSpec.stride_w; - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; U32 dilateH = convParamSpec.dilatedRate_h; U32 dilateW = convParamSpec.dilatedRate_w; @@ -72,20 +72,20 @@ EE depthwise_pointwise_convolution_direct_A55(TensorDesc inputDesc, F16 *inArray_mov = inArray + n * ic * ihiw * 8; for (U32 c = 0; c < ic; c++) { if (paddingT > 0) { - memset(inArray_pad_mov, 0, paddingT * iw_pad * 8 * bytesOf(fdt)); + UNI_MEMSET(inArray_pad_mov, 0, paddingT * iw_pad * 8 * bytesOf(fdt)); inArray_pad_mov += paddingT * iw_pad * 8; } for (U32 h = paddingT; h < ih_pad - paddingB; h++) { - memset(inArray_pad_mov, 0, paddingL * 8 * bytesOf(fdt)); + UNI_MEMSET(inArray_pad_mov, 0, paddingL * 8 * bytesOf(fdt)); inArray_pad_mov += paddingL * 8; - memcpy(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(fdt)); + UNI_MEMCPY(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(fdt)); inArray_pad_mov += iw * 
8; inArray_mov += iw * 8; - memset(inArray_pad_mov, 0, paddingR * 8 * bytesOf(fdt)); + UNI_MEMSET(inArray_pad_mov, 0, paddingR * 8 * bytesOf(fdt)); inArray_pad_mov += paddingR * 8; } if (paddingB > 0) { - memset(inArray_pad_mov, 0, paddingB * iw_pad * 8 * bytesOf(fdt)); + UNI_MEMSET(inArray_pad_mov, 0, paddingB * iw_pad * 8 * bytesOf(fdt)); inArray_pad_mov += paddingB * iw_pad * 8; } @@ -138,30 +138,29 @@ EE depthwise_pointwise_convolution_direct_A55(TensorDesc inputDesc, F16 *in_5 = in_idx + in_h_5 * iw_pad * 8 + in_w_5 * 8; F16 *in_6 = in_idx + in_h_6 * iw_pad * 8 + in_w_6 * 8; F16 *in_7 = in_idx + in_h_7 * iw_pad * 8 + in_w_7 * 8; - __asm__ __volatile__("ldr q17, [%[f0]]\n" - "ldr q9, [%[in0]]\n" - "ldr q10, [%[in1]]\n" - "ldr q11, [%[in2]]\n" - "ldr q12, [%[in3]]\n" - "ldr q13, [%[in4]]\n" - "ldr q14, [%[in5]]\n" - "ldr q15, [%[in6]]\n" - "ldr q16, [%[in7]]\n" - "fmla v0.8h, v9.8h, v17.8h\n" - "fmla v1.8h, v10.8h, v17.8h\n" - "fmla v2.8h, v11.8h, v17.8h\n" - "fmla v3.8h, v12.8h, v17.8h\n" - "fmla v4.8h, v13.8h, v17.8h\n" - "fmla v5.8h, v14.8h, v17.8h\n" - "fmla v6.8h, v15.8h, v17.8h\n" - "fmla v7.8h, v16.8h, v17.8h\n" - : - : [in0] "r"(in_0), [in1] "r"(in_1), [in2] "r"(in_2), - [in3] "r"(in_3), [in4] "r"(in_4), [in5] "r"(in_5), - [in6] "r"(in_6), [in7] "r"(in_7), [f0] "r"(f_0) - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", - "v6", "v7", "v9", "v10", "v11", "v12", "v13", "v14", - "v15", "v16", "v17"); + __asm__ __volatile__( + "ldr q17, [%[f0]]\n" + "ldr q9, [%[in0]]\n" + "ldr q10, [%[in1]]\n" + "ldr q11, [%[in2]]\n" + "ldr q12, [%[in3]]\n" + "ldr q13, [%[in4]]\n" + "ldr q14, [%[in5]]\n" + "ldr q15, [%[in6]]\n" + "ldr q16, [%[in7]]\n" + "fmla v0.8h, v9.8h, v17.8h\n" + "fmla v1.8h, v10.8h, v17.8h\n" + "fmla v2.8h, v11.8h, v17.8h\n" + "fmla v3.8h, v12.8h, v17.8h\n" + "fmla v4.8h, v13.8h, v17.8h\n" + "fmla v5.8h, v14.8h, v17.8h\n" + "fmla v6.8h, v15.8h, v17.8h\n" + "fmla v7.8h, v16.8h, v17.8h\n" + : + : [in0] "r"(in_0), [in1] "r"(in_1), [in2] "r"(in_2), [in3] "r"(in_3), + [in4] "r"(in_4), [in5] "r"(in_5), [in6] "r"(in_6), [in7] "r"(in_7), [f0] "r"(f_0) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17"); } } diff --git a/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_A76.cpp b/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_A76.cpp index 46d0c628..8c49bf9c 100644 --- a/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_A76.cpp +++ b/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_A76.cpp @@ -43,10 +43,10 @@ EE depthwise_pointwise_convolution_direct_A76(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); U32 strideH = convParamSpec.stride_h; U32 strideW = convParamSpec.stride_w; - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; U32 dilateH = convParamSpec.dilatedRate_h; U32 dilateW = convParamSpec.dilatedRate_w; @@ -71,20 +71,20 @@ EE depthwise_pointwise_convolution_direct_A76(TensorDesc inputDesc, F16 *inArray_mov = inArray + n * ic * ihiw * 8; for (U32 c = 0; c < ic; c++) { if (paddingT > 0) { - memset(inArray_pad_mov, 0, paddingT * iw_pad * 8 * bytesOf(fdt)); + 
UNI_MEMSET(inArray_pad_mov, 0, paddingT * iw_pad * 8 * bytesOf(fdt)); inArray_pad_mov += paddingT * iw_pad * 8; } for (U32 h = paddingT; h < ih_pad - paddingB; h++) { - memset(inArray_pad_mov, 0, paddingL * 8 * bytesOf(fdt)); + UNI_MEMSET(inArray_pad_mov, 0, paddingL * 8 * bytesOf(fdt)); inArray_pad_mov += paddingL * 8; - memcpy(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(fdt)); + UNI_MEMCPY(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(fdt)); inArray_pad_mov += iw * 8; inArray_mov += iw * 8; - memset(inArray_pad_mov, 0, paddingR * 8 * bytesOf(fdt)); + UNI_MEMSET(inArray_pad_mov, 0, paddingR * 8 * bytesOf(fdt)); inArray_pad_mov += paddingR * 8; } if (paddingB > 0) { - memset(inArray_pad_mov, 0, paddingB * iw_pad * 8 * bytesOf(fdt)); + UNI_MEMSET(inArray_pad_mov, 0, paddingB * iw_pad * 8 * bytesOf(fdt)); inArray_pad_mov += paddingB * iw_pad * 8; } @@ -137,30 +137,29 @@ EE depthwise_pointwise_convolution_direct_A76(TensorDesc inputDesc, F16 *in_5 = in_idx + in_h_5 * iw_pad * 8 + in_w_5 * 8; F16 *in_6 = in_idx + in_h_6 * iw_pad * 8 + in_w_6 * 8; F16 *in_7 = in_idx + in_h_7 * iw_pad * 8 + in_w_7 * 8; - __asm__ __volatile__("ldr q17, [%[f0]]\n" - "ldr q9, [%[in0]]\n" - "ldr q10, [%[in1]]\n" - "ldr q11, [%[in2]]\n" - "ldr q12, [%[in3]]\n" - "ldr q13, [%[in4]]\n" - "ldr q14, [%[in5]]\n" - "ldr q15, [%[in6]]\n" - "ldr q16, [%[in7]]\n" - "fmla v0.8h, v9.8h, v17.8h\n" - "fmla v1.8h, v10.8h, v17.8h\n" - "fmla v2.8h, v11.8h, v17.8h\n" - "fmla v3.8h, v12.8h, v17.8h\n" - "fmla v4.8h, v13.8h, v17.8h\n" - "fmla v5.8h, v14.8h, v17.8h\n" - "fmla v6.8h, v15.8h, v17.8h\n" - "fmla v7.8h, v16.8h, v17.8h\n" - : - : [in0] "r"(in_0), [in1] "r"(in_1), [in2] "r"(in_2), - [in3] "r"(in_3), [in4] "r"(in_4), [in5] "r"(in_5), - [in6] "r"(in_6), [in7] "r"(in_7), [f0] "r"(f_0) - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", - "v6", "v7", "v9", "v10", "v11", "v12", "v13", "v14", - "v15", "v16", "v17"); + __asm__ __volatile__( + "ldr q17, [%[f0]]\n" + "ldr q9, [%[in0]]\n" + "ldr q10, [%[in1]]\n" + "ldr q11, [%[in2]]\n" + "ldr q12, [%[in3]]\n" + "ldr q13, [%[in4]]\n" + "ldr q14, [%[in5]]\n" + "ldr q15, [%[in6]]\n" + "ldr q16, [%[in7]]\n" + "fmla v0.8h, v9.8h, v17.8h\n" + "fmla v1.8h, v10.8h, v17.8h\n" + "fmla v2.8h, v11.8h, v17.8h\n" + "fmla v3.8h, v12.8h, v17.8h\n" + "fmla v4.8h, v13.8h, v17.8h\n" + "fmla v5.8h, v14.8h, v17.8h\n" + "fmla v6.8h, v15.8h, v17.8h\n" + "fmla v7.8h, v16.8h, v17.8h\n" + : + : [in0] "r"(in_0), [in1] "r"(in_1), [in2] "r"(in_2), [in3] "r"(in_3), + [in4] "r"(in_4), [in5] "r"(in_5), [in6] "r"(in_6), [in7] "r"(in_7), [f0] "r"(f_0) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17"); } } diff --git a/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding.h b/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding.h index dca6b30d..864083b7 100644 --- a/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding.h +++ b/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding.h @@ -14,7 +14,6 @@ #ifndef _H_DEPTHWISE_POINTWISE_CONVOLUTION_DIRECT_NO_PADDING #define _H_DEPTHWISE_POINTWISE_CONVOLUTION_DIRECT_NO_PADDING -#include #include "sys.h" #include "tensor_desc.h" #include "parameter_spec.h" diff --git a/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding_A55.cpp b/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding_A55.cpp index e86fe2c9..d5dd04bd 100644 
--- a/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding_A55.cpp +++ b/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding_A55.cpp @@ -44,8 +44,8 @@ EE depthwise_pointwise_convolution_direct_no_padding_A55(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); U32 strideH = convParamSpec.stride_h; U32 strideW = convParamSpec.stride_w; - U32 paddingT = convParamSpec.padding_top; - U32 paddingL = convParamSpec.padding_left; + U32 paddingT = convParamSpec.pad_top; + U32 paddingL = convParamSpec.pad_left; if (dwFilterDesc.df != DF_NCHWC8 || pwFilterDesc.df != DF_NHWCN16) { CHECK_STATUS(NOT_MATCH); diff --git a/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding_A76.cpp b/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding_A76.cpp index 24bcfb4a..8ea43e8c 100644 --- a/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding_A76.cpp +++ b/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding_A76.cpp @@ -44,8 +44,8 @@ EE depthwise_pointwise_convolution_direct_no_padding_A76(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); U32 strideH = convParamSpec.stride_h; U32 strideW = convParamSpec.stride_w; - U32 paddingT = convParamSpec.padding_top; - U32 paddingL = convParamSpec.padding_left; + U32 paddingT = convParamSpec.pad_top; + U32 paddingL = convParamSpec.pad_left; if (dwFilterDesc.df != DF_NCHWC8 || pwFilterDesc.df != DF_NHWCN16) { CHECK_STATUS(NOT_MATCH); diff --git a/compute/tensor/src/cpu/arm/fp16/gru.cpp b/compute/tensor/src/cpu/arm/fp16/gru.cpp index 28a46a65..4afe84d9 100644 --- a/compute/tensor/src/cpu/arm/fp16/gru.cpp +++ b/compute/tensor/src/cpu/arm/fp16/gru.cpp @@ -11,7 +11,6 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -#include #include "cpu/arm/fp16/tensor_computing_fp16.h" #include "cpu/arm/fp16/mvm_nkn32.h" @@ -54,9 +53,9 @@ EE grucell_fp16(TensorDesc xDesc, U32 batch = in; I32 xDim = ix; - I32 hDim = rnnParamSpec.numOutput; + I32 hDim = rnnParamSpec.num_outputs; I32 column = hDim; - int num1 = rnnParamSpec.biDirection ? 2 : 1; + int num1 = rnnParamSpec.bi_direction ? 
2 : 1; U32 steps = batchStrideH / hDim / num1; if (!(idt == DT_F16 && fdt == DT_F16 && odt == DT_F16)) { CHECK_STATUS(NOT_MATCH); @@ -64,8 +63,7 @@ EE grucell_fp16(TensorDesc xDesc, if (!(3 * column == (I32)fn * 32 && (ix + oh) == fk && in == on)) { CHECK_STATUS(NOT_MATCH); } - ActivationMode activationMode = rnnParamSpec.activationMode; - if (activationMode != ACTIVATION_TANH) { + if (rnnParamSpec.activation_type != ACTIVATION_TANH) { CHECK_STATUS(NOT_SUPPORTED); } @@ -84,16 +82,16 @@ EE grucell_fp16(TensorDesc xDesc, F16 *currentBatchH = currentHArray + m * currentHStride; F16 *currentOutput = outputArray + m * batchStrideH; if (xDim > 0) { - memcpy(xhArray, currentXArray + m * batchStrideX, xDim * sizeof(F16)); - memcpy(xhArray + xDim, lastBatchH, hDim * sizeof(F16)); + UNI_MEMCPY(xhArray, currentXArray + m * batchStrideX, xDim * sizeof(F16)); + UNI_MEMCPY(xhArray + xDim, lastBatchH, hDim * sizeof(F16)); } else { intermediateH = tmpArray; xhArray = lastBatchH; - memcpy(currentOutput, lastBatchH, hDim * sizeof(F16)); + UNI_MEMCPY(currentOutput, lastBatchH, hDim * sizeof(F16)); } const F16 *mBias = (const F16 *)bias[0] + m * steps * column * 3; - memcpy(intermediateH, mBias, column * 2 * sizeof(F16)); + UNI_MEMCPY(intermediateH, mBias, column * 2 * sizeof(F16)); mvm_nkn32(column * 2 / 32, fk, (const F16 *)filter[0], xhArray, intermediateH); F16 *out_z = intermediateH; F16 *out_r = out_z + column; @@ -111,12 +109,12 @@ EE grucell_fp16(TensorDesc xDesc, if (rnnParamSpec.mode == RNN_GRU_LBR) { F16 *h_x_b = (F16 *)mBias + column * 2; F16 *h_h_b = (F16 *)bias[1]; - memcpy(out_h, h_h_b, column * sizeof(F16)); + UNI_MEMCPY(out_h, h_h_b, column * sizeof(F16)); mvm_nkn32(column / 32, hDim, (const F16 *)filter[0] + column * 2 * fk + column * xDim, xhArray + xDim, out_h); array_mul_f16(out_r, out_h, out_h, hDim); if (xDim > 0) { - memcpy(out_r, h_x_b, column * sizeof(F16)); + UNI_MEMCPY(out_r, h_x_b, column * sizeof(F16)); mvm_nkn32( column / 32, xDim, (const F16 *)filter[0] + column * 2 * fk, xhArray, out_r); h_x_b = out_r; @@ -124,7 +122,7 @@ EE grucell_fp16(TensorDesc xDesc, array_add_f16(h_x_b, out_h, out_h, hDim); } else { array_mul_f16(out_r, xhArray + xDim, xhArray + xDim, hDim); - memcpy(out_h, mBias + column * 2, column * sizeof(F16)); + UNI_MEMCPY(out_h, mBias + column * 2, column * sizeof(F16)); mvm_nkn32(column / 32, fk, (const F16 *)filter[0] + column * 2 * fk, xhArray, out_h); } for (h = 0; h < column - 7; h += 8) { @@ -147,7 +145,7 @@ EE grucell_fp16(TensorDesc xDesc, array_scale_f16(out_z, out_z, column, -1, 1); array_mul_f16(out_z, out_h, out_h, column); array_add_f16(out_r, out_h, currentOutput, column); - memcpy(currentBatchH, currentOutput, sizeof(F16) * hDim); + UNI_MEMCPY(currentBatchH, currentOutput, sizeof(F16) * hDim); } return SUCCESS; } diff --git a/compute/tensor/src/cpu/arm/fp16/lstm.cpp b/compute/tensor/src/cpu/arm/fp16/lstm.cpp index 52e1c9f1..f160d722 100644 --- a/compute/tensor/src/cpu/arm/fp16/lstm.cpp +++ b/compute/tensor/src/cpu/arm/fp16/lstm.cpp @@ -11,7 +11,6 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -#include #include "cpu/arm/fp16/tensor_computing_fp16.h" #include "cpu/arm/fp16/mvm_nkn32.h" @@ -54,10 +53,10 @@ EE lstmcell_fp16(TensorDesc xDesc, U32 batch = in; I32 xDim = ix; - I32 hDim = rnnParamSpec.numOutput; - I32 column = (rnnParamSpec.numProjection > 0) ? 
rnnParamSpec.numProjection - : rnnParamSpec.numOutput; - int num1 = rnnParamSpec.biDirection ? 2 : 1; + I32 hDim = rnnParamSpec.num_outputs; + I32 column = (rnnParamSpec.num_projection > 0) ? rnnParamSpec.num_projection + : rnnParamSpec.num_outputs; + int num1 = rnnParamSpec.bi_direction ? 2 : 1; U32 steps = batchStrideH / hDim / num1; if (!(idt == DT_F16 && fdt == DT_F16 && odt == DT_F16)) { CHECK_STATUS(NOT_MATCH); @@ -65,9 +64,8 @@ EE lstmcell_fp16(TensorDesc xDesc, if (!(4 * column == (I32)fn * 32 && (ix + oh) == fk && in == on)) { CHECK_STATUS(NOT_MATCH); } - F32 forgetBias = rnnParamSpec.forgetBias; - ActivationMode activationMode = rnnParamSpec.activationMode; - if (activationMode != ACTIVATION_TANH) { + F32 forgetBias = rnnParamSpec.forget_bias; + if (rnnParamSpec.activation_type != ACTIVATION_TANH) { CHECK_STATUS(NOT_SUPPORTED); } @@ -88,15 +86,15 @@ EE lstmcell_fp16(TensorDesc xDesc, for (U32 m = 0; m < batch; m++) { F16 *lastBatchH = lastHArray + m * lastHStride; if (xDim > 0) { - memcpy(xhArray, currentXArray + m * batchStrideX, xDim * sizeof(F16)); - memcpy(xhArray + xDim, lastBatchH, hDim * sizeof(F16)); + UNI_MEMCPY(xhArray, currentXArray + m * batchStrideX, xDim * sizeof(F16)); + UNI_MEMCPY(xhArray + xDim, lastBatchH, hDim * sizeof(F16)); } else { intermediateH = tmpArray; xhArray = lastBatchH; } const F16 *mBias = (const F16 *)bias[0] + m * steps * column * 4; - memcpy(intermediateH, mBias, column * 4 * sizeof(F16)); + UNI_MEMCPY(intermediateH, mBias, column * 4 * sizeof(F16)); mvm_nkn32(fn, fk, (const F16 *)filter[0], xhArray, intermediateH); F16 *out_i = intermediateH; @@ -110,12 +108,12 @@ EE lstmcell_fp16(TensorDesc xDesc, F16 *currentOutput = outputArray + m * batchStrideH; F16 *tmpState, *tmpHH, *tmpH; - if (rnnParamSpec.zoneoutCell == 0) { + if (rnnParamSpec.zoneout_cell == 0) { tmpState = currentBatchState; } else { tmpState = out_i; } - if (rnnParamSpec.numProjection > 0) { + if (rnnParamSpec.num_projection > 0) { tmpHH = out_g; tmpH = currentOutput; } else { @@ -150,26 +148,26 @@ EE lstmcell_fp16(TensorDesc xDesc, tmpState[h] = C_s; tmpHH[h] = value; } - if (rnnParamSpec.zoneoutCell != 0) { - array_scale_f16(tmpState, tmpState, column, 1 - rnnParamSpec.zoneoutCell, 0); - array_scale_f16(lastBatchState, lastBatchState, column, rnnParamSpec.zoneoutCell, 0); + if (rnnParamSpec.zoneout_cell != 0) { + array_scale_f16(tmpState, tmpState, column, 1 - rnnParamSpec.zoneout_cell, 0); + array_scale_f16(lastBatchState, lastBatchState, column, rnnParamSpec.zoneout_cell, 0); array_add_f16(tmpState, lastBatchState, currentBatchState, column); } - if (rnnParamSpec.numProjection > 0) { - memset(tmpH, 0, sizeof(F16) * hDim); - mvm_nkn32(hDim / 32, rnnParamSpec.numProjection, (const F16 *)filter[1], tmpHH, tmpH); + if (rnnParamSpec.num_projection > 0) { + UNI_MEMSET(tmpH, 0, sizeof(F16) * hDim); + mvm_nkn32(hDim / 32, rnnParamSpec.num_projection, (const F16 *)filter[1], tmpHH, tmpH); } - if (rnnParamSpec.zoneoutOutput != 0) { - if (rnnParamSpec.numProjection > 0) { - array_scale_f16(tmpH, out_f, hDim, 1 - rnnParamSpec.zoneoutOutput, 0); + if (rnnParamSpec.zoneout_output != 0) { + if (rnnParamSpec.num_projection > 0) { + array_scale_f16(tmpH, out_f, hDim, 1 - rnnParamSpec.zoneout_output, 0); } else { - array_scale_f16(tmpHH, out_f, hDim, 1 - rnnParamSpec.zoneoutOutput, 0); + array_scale_f16(tmpHH, out_f, hDim, 1 - rnnParamSpec.zoneout_output, 0); } - array_scale_f16(lastBatchH, lastBatchH, hDim, rnnParamSpec.zoneoutOutput, 0); + array_scale_f16(lastBatchH, lastBatchH, hDim, 
rnnParamSpec.zoneout_output, 0); array_add_f16(out_f, lastBatchH, currentBatchH, hDim); } else { - memcpy(currentBatchH, currentOutput, sizeof(F16) * hDim); + UNI_MEMCPY(currentBatchH, currentOutput, sizeof(F16) * hDim); } } return SUCCESS; diff --git a/compute/tensor/src/cpu/arm/fp16/normalization.cpp b/compute/tensor/src/cpu/arm/fp16/normalization.cpp index 503e2970..97285ecf 100644 --- a/compute/tensor/src/cpu/arm/fp16/normalization.cpp +++ b/compute/tensor/src/cpu/arm/fp16/normalization.cpp @@ -14,10 +14,11 @@ #include #include "cpu/arm/fp16/tensor_computing_fp16.h" -inline void array_norm_scale_fp16( +static float eps = 1e-6; + +inline static void array_norm_scale_fp16( F16 *input, F16 *output, I32 len, F32 mean, F32 var, F16 *alpha, F16 *beta) { - F32 eps = 1e-6; F32 std_value = sqrt(var + eps); float16x8_t mean_v = vdupq_n_f16(mean); float16x8_t std_v = vdupq_n_f16(std_value); @@ -38,14 +39,10 @@ inline void array_norm_scale_fp16( } } -EE layer_normalization_fp16( +static EE layer_normalization_nhwc( TensorDesc inputDesc, F16 *input, F16 *alpha, F16 *beta, TensorDesc outputDesc, F16 *output) { UNUSED(outputDesc); - if (nullptr == alpha || nullptr == beta || nullptr == input || nullptr == output) { - CHECK_STATUS(NULL_POINTER); - } - U32 size = tensorNumElements(inputDesc); I32 size_inner = inputDesc.dims[0]; I32 size_outer = size / size_inner; @@ -57,6 +54,77 @@ EE layer_normalization_fp16( array_norm_scale_fp16(current_input, current_output, size_inner, mean, var, alpha, beta); } + return SUCCESS; +} + +static EE layer_normalization_nchwc8( + TensorDesc inputDesc, F16 *input, F16 *alpha, F16 *beta, TensorDesc outputDesc, F16 *output) +{ + UNUSED(outputDesc); + int n = inputDesc.dims[inputDesc.nDims - 1]; + int c = inputDesc.dims[inputDesc.nDims - 2]; + int hw = 1; + for (unsigned int i = 0; i < inputDesc.nDims - 2; i++) { + hw *= inputDesc.dims[i]; + } + int c8 = c / 8; + for (int i = 0; i < n; i++) { + for (int j = 0; j < hw; j++) { + float16x8_t sum_v = vdupq_n_f16(0); + for (int k = 0; k < c8; k++) { + int id = ((i * c8 + k) * hw + j) * 8; + sum_v = vaddq_f16(sum_v, vld1q_f16(input + id)); + } + F32 mean = vaddvq_f16(sum_v) / c; + float16x8_t mean_v = vdupq_n_f16(mean); + sum_v = vdupq_n_f16(0); + for (int k = 0; k < c8; k++) { + int id = ((i * c8 + k) * hw + j) * 8; + float16x8_t tmp_v = vsubq_f16(vld1q_f16(input + id), mean_v); + sum_v = vfmaq_f16(sum_v, tmp_v, tmp_v); + } + F32 var = vaddvq_f16(sum_v) / c; + F32 std_value = sqrt(var + eps); + + float16x8_t std_v = vdupq_n_f16(std_value); + for (int k = 0, kk = 0; k < c8; k++, kk += 8) { + int id = ((i * c8 + k) * hw + j) * 8; + float16x8_t in = vld1q_f16(input + id); + float16x8_t alpha_v = vld1q_f16(alpha + kk); + float16x8_t beta_v = vld1q_f16(beta + kk); + + float16x8_t tmp_v = vsubq_f16(in, mean_v); + tmp_v = vdivq_f16(tmp_v, std_v); + tmp_v = vfmaq_f16(beta_v, alpha_v, tmp_v); + vst1q_f16(output + id, tmp_v); + } + } + } return SUCCESS; } + +EE layer_normalization_fp16(TensorDesc inputDesc, + F16 *input, + LayerNormParamSpec p, + F16 *alpha, + F16 *beta, + TensorDesc outputDesc, + F16 *output) +{ + if (nullptr == alpha || nullptr == beta || nullptr == input || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + + EE ret = NOT_SUPPORTED; + if (inputDesc.df == DF_NCHWC8) { + if (p.axis == 1) { + ret = layer_normalization_nchwc8(inputDesc, input, alpha, beta, outputDesc, output); + } + } else { + if (p.axis == -1) { + ret = layer_normalization_nhwc(inputDesc, input, alpha, beta, outputDesc, output); + } + } + return 
ret; +} diff --git a/compute/tensor/src/cpu/arm/fp16/scale.cpp b/compute/tensor/src/cpu/arm/fp16/scale.cpp index 0f736c7c..35148077 100644 --- a/compute/tensor/src/cpu/arm/fp16/scale.cpp +++ b/compute/tensor/src/cpu/arm/fp16/scale.cpp @@ -35,28 +35,39 @@ EE scale_nchwc8_fp16( return SUCCESS; } +template EE scale_nchw_fp16( F16 *input, F16 *alpha, F16 *beta, I32 in, I32 ic, I32 elements_per_channel, F16 *output) { float16x8_t one = vdupq_n_f16(1.); float16x8_t zero = vdupq_n_f16(0.); - U32 index = 0; + U32 dst = 0, src = 0; for (I32 n = 0; n < in; n++) { for (I32 c = 0; c < ic; c++) { float16x8_t alpha_vec = (alpha == nullptr) ? one : vdupq_n_f16(alpha[c]); float16x8_t beta_vec = (beta == nullptr) ? zero : vdupq_n_f16(beta[c]); I32 i = 0; for (; i < elements_per_channel - 7; i += 8) { - float16x8_t in_vec = vld1q_f16(input + index); + if (icoc_equal) { + src = (n * ic + c) * elements_per_channel + i; + } else { + src = n * elements_per_channel + i; + } + float16x8_t in_vec = vld1q_f16(input + src); float16x8_t out_vec = vfmaq_f16(beta_vec, alpha_vec, in_vec); - vst1q_f16(output + index, out_vec); - index += 8; + vst1q_f16(output + dst, out_vec); + dst += 8; } for (; i < elements_per_channel; i++) { + if (icoc_equal) { + src = (n * ic + c) * elements_per_channel + i; + } else { + src = n * elements_per_channel + i; + } float alpha_s = (alpha == nullptr) ? 1 : alpha[c]; float beta_s = (beta == nullptr) ? 0 : beta[c]; - output[index] = alpha_s * input[index] + beta_s; - index++; + output[dst] = alpha_s * input[src] + beta_s; + dst++; } } } @@ -119,7 +130,11 @@ EE scale_fp16(F16 *input, EE ret = SUCCESS; // If oc is 1, it means that weights/vectors have only one param, so we need use the calculation logic of nchw. if (axis == 1 || axis == 0 || oc == 1) { - ret = scale_nchw_fp16(input, alpha, beta, on, oc, elements_per_channel, output); + if (ic == oc) { + ret = scale_nchw_fp16(input, alpha, beta, on, oc, elements_per_channel, output); + } else { + ret = scale_nchw_fp16(input, alpha, beta, on, oc, elements_per_channel, output); + } } else if (axis == nDims - 1) { if (ic == oc) { ret = scale_nhwc_fp16(input, alpha, beta, on, oc, elements_per_channel, output); diff --git a/compute/tensor/src/cpu/arm/fp16/softmax.cpp b/compute/tensor/src/cpu/arm/fp16/softmax.cpp index 2e5b4178..31ecac0b 100644 --- a/compute/tensor/src/cpu/arm/fp16/softmax.cpp +++ b/compute/tensor/src/cpu/arm/fp16/softmax.cpp @@ -14,59 +14,77 @@ #include "cpu/arm/fp16/tensor_computing_fp16.h" #include "tensor_transpose.h" -void softmax_lastAxis_fp16(const F16 *input, I32 loopOuter, I32 loops, F16 *output) +template +static void softmax_lastAxis_fp16(const F16 *input, I32 loopOuter, I32 loops, F16 *output) { for (I32 i = 0; i < loopOuter; i++) { const F16 *inputPtr = input + i * loops; F16 *outputPtr = output + i * loops; - float16x8_t max_v, sub_v, sum_v, tmp_v; + float16x8_t max_v, tmp_v; F32 max_s, tmp_s; - array_minmax_value_f16(inputPtr, loops, 2, &max_s); - max_v = vdupq_n_f16(max_s); - sum_v = vdupq_n_f16(0); - + if (!logsoftmax) { + array_minmax_value_f16(inputPtr, loops, 2, &max_s); + max_v = vdupq_n_f16(max_s); + } I32 j = 0; - F32 sum_s = 0; - for (j = 0; j < loops - 7; j += 8) { + float16x8_t sum_v = vdupq_n_f16(0); + for (; j < loops - 7; j += 8) { float16x8_t in = vld1q_f16(inputPtr + j); - sub_v = vsubq_f16(in, max_v); - tmp_v = vexpq_f16_f32(sub_v); + if (!logsoftmax) { + in = vsubq_f16(in, max_v); + } + tmp_v = vexpq_f16_f32(in); sum_v = vaddq_f16(sum_v, tmp_v); - vst1q_f16(outputPtr + j, tmp_v); + if (!logsoftmax) { 
+ vst1q_f16(outputPtr + j, tmp_v); + } } - sum_s += vaddvq_f16(sum_v); + F32 sum_s = vaddvq_f16(sum_v); for (; j < loops; j++) { - tmp_s = exp(inputPtr[j] - max_s); - outputPtr[j] = tmp_s; + if (logsoftmax) { + tmp_s = exp(inputPtr[j]); + } else { + tmp_s = exp(inputPtr[j] - max_s); + outputPtr[j] = tmp_s; + } sum_s += tmp_s; } - array_scale_f16(outputPtr, outputPtr, loops, 1.0 / sum_s, 0); + if (logsoftmax) { + array_scale_f16(inputPtr, outputPtr, loops, 1.0, -log(sum_s)); + } else { + array_scale_f16(outputPtr, outputPtr, loops, 1.0 / sum_s, 0); + } } } -void softmax_anyAxis_fp16(const F16 *input, I32 loopOuter, I32 loops, I32 loopInner, F16 *output) +template +static void softmax_anyAxis_fp16( + const F16 *input, I32 loopOuter, I32 loops, I32 loopInner, F16 *output) { std::vector buffer(loopInner * 2); F16 *maxBuffer = &buffer[0]; F16 *sumBuffer = &buffer[loopInner]; I32 k = 0; + F32 tmp_s; for (I32 i = 0; i < loopOuter; i++) { const F16 *inputPtrBase = input + i * loops * loopInner; F16 *outputPtrBase = output + i * loops * loopInner; - memcpy(maxBuffer, inputPtrBase, loopInner * sizeof(F16)); - memset(sumBuffer, 0, loopInner * sizeof(F16)); - for (I32 j = 1; j < loops; j++) { - const F16 *inputPtr = inputPtrBase + j * loopInner; - for (k = 0; k < loopInner - 7; k += 8) { - float16x8_t in_v = vld1q_f16(inputPtr + k); - float16x8_t out_v = vld1q_f16(maxBuffer + k); - float16x8_t max_v = vmaxq_f16(in_v, out_v); - vst1q_f16(maxBuffer + k, max_v); - } - for (; k < loopInner; k++) { - maxBuffer[k] = UNI_MAX(maxBuffer[k], inputPtr[k]); + UNI_MEMSET(sumBuffer, 0, loopInner * sizeof(F16)); + if (!logsoftmax) { + UNI_MEMCPY(maxBuffer, inputPtrBase, loopInner * sizeof(F16)); + for (I32 j = 1; j < loops; j++) { + const F16 *inputPtr = inputPtrBase + j * loopInner; + for (k = 0; k < loopInner - 7; k += 8) { + float16x8_t in_v = vld1q_f16(inputPtr + k); + float16x8_t out_v = vld1q_f16(maxBuffer + k); + float16x8_t max_v = vmaxq_f16(in_v, out_v); + vst1q_f16(maxBuffer + k, max_v); + } + for (; k < loopInner; k++) { + maxBuffer[k] = UNI_MAX(maxBuffer[k], inputPtr[k]); + } } } for (I32 j = 0; j < loops; j++) { @@ -74,35 +92,69 @@ void softmax_anyAxis_fp16(const F16 *input, I32 loopOuter, I32 loops, I32 loopIn F16 *outputPtr = outputPtrBase + j * loopInner; for (k = 0; k < loopInner - 7; k += 8) { float16x8_t in_v = vld1q_f16(inputPtr + k); - float16x8_t max_v = vld1q_f16(maxBuffer + k); - float16x8_t sub_v = vsubq_f16(in_v, max_v); - float16x8_t exp_v = vexpq_f16_f32(sub_v); + if (!logsoftmax) { + in_v = vsubq_f16(in_v, vld1q_f16(maxBuffer + k)); + } + float16x8_t exp_v = vexpq_f16_f32(in_v); float16x8_t sum_v = vld1q_f16(sumBuffer + k); sum_v = vaddq_f16(sum_v, exp_v); vst1q_f16(sumBuffer + k, sum_v); - vst1q_f16(outputPtr + k, exp_v); + if (!logsoftmax) { + vst1q_f16(outputPtr + k, exp_v); + } } for (; k < loopInner; k++) { - outputPtr[k] = exp(inputPtr[k] - maxBuffer[k]); - sumBuffer[k] += outputPtr[k]; + if (logsoftmax) { + tmp_s = exp(inputPtr[k]); + } else { + tmp_s = exp(inputPtr[k] - maxBuffer[k]); + outputPtr[k] = tmp_s; + } + sumBuffer[k] += tmp_s; } } - for (I32 j = 0; j < loops; j++) { - F16 *outputPtr = outputPtrBase + j * loopInner; + if (logsoftmax) { for (k = 0; k < loopInner - 7; k += 8) { - float16x8_t out_v = vld1q_f16(outputPtr + k); float16x8_t sum_v = vld1q_f16(sumBuffer + k); - out_v = vdivq_f16(out_v, sum_v); - vst1q_f16(outputPtr + k, out_v); + sum_v = vlogq_f16(sum_v); + vst1q_f16(sumBuffer + k, sum_v); } for (; k < loopInner; k++) { - outputPtr[k] /= sumBuffer[k]; + 
sumBuffer[k] = log(sumBuffer[k]); + } + for (I32 j = 0; j < loops; j++) { + const F16 *inputPtr = inputPtrBase + j * loopInner; + F16 *outputPtr = outputPtrBase + j * loopInner; + for (k = 0; k < loopInner - 7; k += 8) { + float16x8_t out_v = vld1q_f16(inputPtr + k); + float16x8_t sum_v = vld1q_f16(sumBuffer + k); + out_v = vsubq_f16(out_v, sum_v); + vst1q_f16(outputPtr + k, out_v); + } + for (; k < loopInner; k++) { + outputPtr[k] -= sumBuffer[k]; + } + } + } else { + for (I32 j = 0; j < loops; j++) { + F16 *outputPtr = outputPtrBase + j * loopInner; + for (k = 0; k < loopInner - 7; k += 8) { + float16x8_t out_v = vld1q_f16(outputPtr + k); + float16x8_t sum_v = vld1q_f16(sumBuffer + k); + out_v = vdivq_f16(out_v, sum_v); + vst1q_f16(outputPtr + k, out_v); + } + for (; k < loopInner; k++) { + outputPtr[k] /= sumBuffer[k]; + } } } } } -EE softmax_fp16(TensorDesc inputDesc, const F16 *input, int axis, TensorDesc outputDesc, F16 *output) +template +static EE softmax_kernel( + TensorDesc inputDesc, const F16 *input, int axis, TensorDesc outputDesc, F16 *output) { UNUSED(outputDesc); if (nullptr == input || nullptr == output) { @@ -145,9 +197,20 @@ EE softmax_fp16(TensorDesc inputDesc, const F16 *input, int axis, TensorDesc out } U32 loop_outer = size / loops / loop_inner; if (axis == 0) { - softmax_lastAxis_fp16(input, loop_outer, loops, output); + softmax_lastAxis_fp16(input, loop_outer, loops, output); } else { - softmax_anyAxis_fp16(input, loop_outer, loops, loop_inner, output); + softmax_anyAxis_fp16(input, loop_outer, loops, loop_inner, output); } return SUCCESS; } + +EE softmax_fp16(TensorDesc inputDesc, const F16 *input, int axis, TensorDesc outputDesc, F16 *output) +{ + return softmax_kernel(inputDesc, input, axis, outputDesc, output); +} + +EE logsoftmax_fp16( + TensorDesc inputDesc, const F16 *input, int axis, TensorDesc outputDesc, F16 *output) +{ + return softmax_kernel(inputDesc, input, axis, outputDesc, output); +} diff --git a/compute/tensor/src/cpu/arm/fp16/tensor_computing_fp16.h b/compute/tensor/src/cpu/arm/fp16/tensor_computing_fp16.h index a9e008de..ca7b1b79 100644 --- a/compute/tensor/src/cpu/arm/fp16/tensor_computing_fp16.h +++ b/compute/tensor/src/cpu/arm/fp16/tensor_computing_fp16.h @@ -67,6 +67,12 @@ EE pooling_c8_fp16(const I32 &tstart, EE softmax_fp16( TensorDesc inputDesc, const F16 *input, int axis, TensorDesc outputDesc, F16 *output); +EE logsoftmax_fp16( + TensorDesc inputDesc, const F16 *input, int axis, TensorDesc outputDesc, F16 *output); + +EE logsoftmax_fp16( + TensorDesc inputDesc, const F16 *input, int axis, TensorDesc outputDesc, F16 *output); + EE attention_fp16(U32 batch, U32 numHeads, I32 fromSequenceLength, @@ -167,8 +173,13 @@ EE power_fp16(TensorDesc inputDesc, TensorDesc outputDesc, F16 *output); -EE layer_normalization_fp16( - TensorDesc inputDesc, F16 *input, F16 *alpha, F16 *beta, TensorDesc outputDesc, F16 *output); +EE layer_normalization_fp16(TensorDesc inputDesc, + F16 *input, + LayerNormParamSpec p, + F16 *alpha, + F16 *beta, + TensorDesc outputDesc, + F16 *output); EE scale_fp16(F16 *input, I32 axis, diff --git a/compute/tensor/src/cpu/arm/fp32/arm_functions_fp32.h b/compute/tensor/src/cpu/arm/fp32/arm_functions_fp32.h index 9d2e9cf8..a6797bb5 100644 --- a/compute/tensor/src/cpu/arm/fp32/arm_functions_fp32.h +++ b/compute/tensor/src/cpu/arm/fp32/arm_functions_fp32.h @@ -14,11 +14,8 @@ #ifndef _H_ARM_FUNCTIONS_FP32 #define _H_ARM_FUNCTIONS_FP32 -#include +#include "cpu/cpu_functions_template.h" #include "arm_neon_expand.h" -#include "uni.h" 
-#include "data_type.h" -#include "parameter_spec.h" // array sum inline F32 array_sum_f32(const F32 *data, I32 len) @@ -232,7 +229,7 @@ inline void array_power_f32(F32 *input, F32 *output, I32 len, F32 power) #endif } else if (power == 1) { if (input != output) { - memcpy(output, input, len * sizeof(F32)); + UNI_MEMCPY(output, input, len * sizeof(F32)); } i = len; } else if (power == 2) { @@ -249,137 +246,109 @@ inline void array_power_f32(F32 *input, F32 *output, I32 len, F32 power) inline EE activation_fp32(F32 *input, U32 len, ActivationParamSpec activationDesc, F32 *output) { - float32x4_t in, out; float32x4_t zero = vdupq_n_f32(0.); float32x4_t one = vdupq_n_f32(1.); float32x4_t three = vdupq_n_f32(3.); float32x4_t six = vdupq_n_f32(6.); - U32 len_main = len / 4; - U32 len_tail = len % 4; - - F32 value; + U32 loops = len / 4 * 4; EE ret = SUCCESS; switch (activationDesc.mode) { case ACTIVATION_NULL: { + if (output != input) { + UNI_MEMCPY(output, input, sizeof(float) * len); + } + loops = len; break; } case ACTIVATION_RELU: { if (activationDesc.value[0] == 0) { - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f32(input); - out = vmaxq_f32(zero, in); - vst1q_f32(output, out); - input += 4; - output += 4; - } - for (U32 i = 0; i < len_tail; i++) { - output[i] = (input[i] < 0) ? 0 : input[i]; +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = 0; i < loops; i += 4) { + float32x4_t in = vld1q_f32(input + i); + float32x4_t out = vmaxq_f32(zero, in); + vst1q_f32(output + i, out); } } else { float32x4_t scale = vdupq_n_f32(activationDesc.value[0]); - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f32(input); +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = 0; i < loops; i += 4) { + float32x4_t in = vld1q_f32(input + i); float32x4_t tmp = vmulq_f32(in, scale); - out = vmaxq_f32(tmp, in); - vst1q_f32(output, out); - input += 4; - output += 4; - } - for (U32 i = 0; i < len_tail; i++) { - float tmp = activationDesc.value[0] * input[i]; - output[i] = (input[i] < tmp) ? tmp : input[i]; + float32x4_t out = vmaxq_f32(tmp, in); + vst1q_f32(output + i, out); } } break; } case ACTIVATION_RELU6: { - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f32(input); - out = vmaxq_f32(zero, in); +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = 0; i < loops; i += 4) { + float32x4_t in = vld1q_f32(input + i); + float32x4_t out = vmaxq_f32(zero, in); out = vminq_f32(six, out); - vst1q_f32(output, out); - input += 4; - output += 4; - } - for (U32 i = 0; i < len_tail; i++) { - value = (input[i] < 0) ? 0 : input[i]; - if (value > 6) { - value = 6; - } - output[i] = value; + vst1q_f32(output + i, out); } break; } case ACTIVATION_H_SIGMOID: { - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f32(input); - out = vaddq_f32(in, three); +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = 0; i < loops; i += 4) { + float32x4_t in = vld1q_f32(input + i); + float32x4_t out = vaddq_f32(in, three); out = vmaxq_f32(out, zero); out = vminq_f32(out, six); out = vdivq_f32(out, six); - vst1q_f32(output, out); - input += 4; - output += 4; - } - for (U32 i = 0; i < len_tail; i++) { - value = input[i] + 3; - value = (value < 0) ? 0 : value; - value = (value > 6) ? 
6 : value; - value = value / 6; - output[i] = value; + vst1q_f32(output + i, out); } break; } case ACTIVATION_H_SWISH: { - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f32(input); - out = vaddq_f32(in, three); +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = 0; i < loops; i += 4) { + float32x4_t in = vld1q_f32(input + i); + float32x4_t out = vaddq_f32(in, three); out = vmaxq_f32(out, zero); out = vminq_f32(out, six); out = vdivq_f32(out, six); out = vmulq_f32(out, in); - vst1q_f32(output, out); - input += 4; - output += 4; - } - for (U32 i = 0; i < len_tail; i++) { - value = input[i] + 3; - value = (value < 0) ? 0 : value; - value = (value > 6) ? 6 : value; - value = input[i] * value; - value = value / 6; - output[i] = value; + vst1q_f32(output + i, out); } break; } case ACTIVATION_H_SWISH_NODIV: { - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f32(input); - out = vaddq_f32(in, three); +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = 0; i < loops; i += 4) { + float32x4_t in = vld1q_f32(input + i); + float32x4_t out = vaddq_f32(in, three); out = vmaxq_f32(out, zero); out = vminq_f32(out, six); out = vmulq_f32(out, in); - vst1q_f32(output, out); - input += 4; - output += 4; - } - for (U32 i = 0; i < len_tail; i++) { - value = input[i] + 3; - value = (value < 0) ? 0 : value; - value = (value > 6) ? 6 : value; - value = input[i] * value; - output[i] = value; + vst1q_f32(output + i, out); } break; } case ACTIVATION_GELU: { - F32 two_div_PI_sqrt = sqrt(2 / 3.14159265358979323846); - float32x4_t vec0 = vdupq_n_f32(two_div_PI_sqrt); + float32x4_t vec0 = vdupq_n_f32(sqrt(2 / 3.14159265358979323846)); float32x4_t vec1 = vdupq_n_f32(0.044715); float32x4_t vec2 = vdupq_n_f32(0.5); - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f32(input); - out = vmulq_f32(in, in); +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = 0; i < loops; i += 4) { + float32x4_t in = vld1q_f32(input + i); + float32x4_t out = vmulq_f32(in, in); out = vmulq_f32(out, in); out = vfmaq_f32(in, vec1, out); out = vmulq_f32(vec0, out); @@ -387,136 +356,122 @@ inline EE activation_fp32(F32 *input, U32 len, ActivationParamSpec activationDes out = vaddq_f32(one, out); out = vmulq_f32(vec2, out); out = vmulq_f32(in, out); - vst1q_f32(output, out); - input += 4; - output += 4; - } - for (U32 i = 0; i < len_tail; i++) { - value = input[i]; - value = two_div_PI_sqrt * (value + 0.044715 * powf(value, 3)); - value = 1.0 - 2.0 / (exp(2.0 * value) + 1.0); - value = 0.5 * (1.0 + value); - value = input[i] * value; - output[i] = value; + vst1q_f32(output + i, out); } break; } case ACTIVATION_TANH: { - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f32(input); - out = vtanhq_f32(in); - vst1q_f32(output, out); - input += 4; - output += 4; - } - for (U32 i = 0; i < len_tail; i++) { - value = 1.0 - 2.0 / (exp(2.0 * input[i]) + 1.0); - output[i] = value; +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = 0; i < loops; i += 4) { + float32x4_t in = vld1q_f32(input + i); + float32x4_t out = vtanhq_f32(in); + vst1q_f32(output + i, out); } break; } case ACTIVATION_SIGMOID: { - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f32(input); - out = vsigmoidq_f32(in); - vst1q_f32(output, out); - input += 4; - output += 4; - } - for (U32 i = 0; i < len_tail; i++) { - value = 1.0 / (1.0 + exp(-1.0 * input[i])); - output[i] = value; +#ifdef _USE_OPENMP 
+#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = 0; i < loops; i += 4) { + float32x4_t in = vld1q_f32(input + i); + float32x4_t out = vsigmoidq_f32(in); + vst1q_f32(output + i, out); } break; } - case ACTIVATION_MISH: { - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f32(input); - out = vmulq_f32( - in, vtanhq_f32(vlogq_f32(vaddq_f32(vexpq_f32_03_percent_error(in), one)))); - vst1q_f32(output, out); - input += 4; - output += 4; - } - for (U32 i = 0; i < len_tail; i++) { - value = input[i] * tanh(log(exp(input[i]) + 1.0)); - output[i] = value; + case ACTIVATION_SWISH: { +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = 0; i < loops; i += 4) { + float32x4_t in = vld1q_f32(input + i); + float32x4_t out = vmulq_f32(in, vsigmoidq_f32(in)); + vst1q_f32(output + i, out); } break; } - case ACTIVATION_GREATER: { - for (U32 i = 0; i < len; i++) { - output[i] = input[i] > 1 ? 1 : 0; + case ACTIVATION_MISH: { +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = 0; i < loops; i += 4) { + float32x4_t in = vld1q_f32(input + i); + float32x4_t out = vmulq_f32( + in, vtanhq_f32(vlogq_f32(vaddq_f32(vexpq_f32_03_percent_error(in), one)))); + vst1q_f32(output + i, out); } break; } case ACTIVATION_SOFTPLUS: { - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f32(input); - out = vlogq_f32(vaddq_f32(vexpq_f32_03_percent_error(in), one)); - vst1q_f32(output, out); - input += 4; - output += 4; - } - for (U32 i = 0; i < len_tail; i++) { - output[i] = log(1 + exp(input[i])); +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = 0; i < loops; i += 4) { + float32x4_t in = vld1q_f32(input + i); + float32x4_t out = vlogq_f32(vaddq_f32(vexpq_f32_03_percent_error(in), one)); + vst1q_f32(output + i, out); } break; } case ACTIVATION_EXP: { - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f32(input); - out = vexpq_f32_03_percent_error(in); - vst1q_f32(output, out); - input += 4; - output += 4; - } - for (U32 i = 0; i < len_tail; i++) { - output[i] = exp(input[i]); +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = 0; i < loops; i += 4) { + float32x4_t in = vld1q_f32(input + i); + float32x4_t out = vexpq_f32_03_percent_error(in); + vst1q_f32(output + i, out); } break; } case ACTIVATION_ABS: { - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f32(input); - out = vabsq_f32(in); - vst1q_f32(output, out); - input += 4; - output += 4; - } - for (U32 i = 0; i < len_tail; i++) { - output[i] = UNI_ABS(input[i]); - } - break; - } - case ACTIVATION_SIGN: { - for (U32 i = 0; i < len; i++) { - output[i] = UNI_SIGN(input[i]); - } - break; - } - case ACTIVATION_LOG: { - for (U32 i = 0; i < len; i++) { - output[i] = log(input[i]); +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = 0; i < loops; i += 4) { + float32x4_t in = vld1q_f32(input + i); + float32x4_t out = vabsq_f32(in); + vst1q_f32(output + i, out); } break; } - case ACTIVATION_NOT: { - for (U32 i = 0; i < len; i++) { - output[i] = (input[i] > 0) ? 
0 : 1; + case ACTIVATION_RECIPROCAL: { +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = 0; i < loops; i += 4) { + float32x4_t in = vld1q_f32(input + i); + float32x4_t out = vdivq_f32(one, in); + vst1q_f32(output + i, out); } break; } - case ACTIVATION_NEG: { - for (U32 i = 0; i < len; i++) { - output[i] = -input[i]; - } + case ACTIVATION_SIGN: + case ACTIVATION_LOG: + case ACTIVATION_NOT: + case ACTIVATION_GREATER: + case ACTIVATION_NEG: + case ACTIVATION_ROUND: + case ACTIVATION_CEIL: + case ACTIVATION_FLOOR: { + loops = 0; break; } default: ret = NOT_SUPPORTED; break; } + if (ret == SUCCESS) { +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = loops; i < len; i++) { + ret = activation_template(activationDesc, input[i], output + i); + } + } return ret; } diff --git a/compute/tensor/src/cpu/arm/fp32/attention.cpp b/compute/tensor/src/cpu/arm/fp32/attention.cpp index 6861cae6..ef01a118 100644 --- a/compute/tensor/src/cpu/arm/fp32/attention.cpp +++ b/compute/tensor/src/cpu/arm/fp32/attention.cpp @@ -11,7 +11,6 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -#include #include "cpu/arm/fp32/tensor_computing_fp32.h" EE attention_fp32(U32 batch, @@ -26,14 +25,14 @@ EE attention_fp32(U32 batch, } F32 mask_s = -10000.0; - I32 count = array_sum_f32(input, toSequenceLength); - I32 valid = UNI_MIN(count, fromSequenceLength); float32x4_t mask_v = vdupq_n_f32(mask_s); float32x4_t one_v = vdupq_n_f32(1.0); for (U32 n = 0; n < batch; n++) { + U32 count = array_sum_f32(input, toSequenceLength); + U32 valid = UNI_MIN(count, (U32)fromSequenceLength); for (U32 i = 0; i < numHeads; i++) { if (i == 0) { - for (I32 j = 0; j < valid; j++) { + for (U32 j = 0; j < valid; j++) { if (j == 0) { I32 k = 0; for (; k < toSequenceLength - 3; k += 4) { @@ -47,12 +46,12 @@ EE attention_fp32(U32 batch, output[k] = value; } } else { - memcpy( + UNI_MEMCPY( output + j * toSequenceLength, output, toSequenceLength * sizeof(F32)); } } - for (I32 j = valid; j < fromSequenceLength; j++) { + for (U32 j = valid; j < (U32)fromSequenceLength; j++) { if (j == valid) { I32 k = 0; for (; k < toSequenceLength - 3; k += 4) { @@ -62,12 +61,12 @@ EE attention_fp32(U32 batch, output[j * toSequenceLength + k] = mask_s; } } else { - memcpy(output + j * toSequenceLength, output + valid * toSequenceLength, + UNI_MEMCPY(output + j * toSequenceLength, output + valid * toSequenceLength, toSequenceLength * sizeof(F32)); } } } else { - memcpy(output + i * fromSequenceLength * toSequenceLength, output, + UNI_MEMCPY(output + i * fromSequenceLength * toSequenceLength, output, fromSequenceLength * toSequenceLength * sizeof(F32)); } } diff --git a/compute/tensor/src/cpu/arm/fp32/attention_mask.cpp b/compute/tensor/src/cpu/arm/fp32/attention_mask.cpp index 3a34c6dc..3b3de80b 100644 --- a/compute/tensor/src/cpu/arm/fp32/attention_mask.cpp +++ b/compute/tensor/src/cpu/arm/fp32/attention_mask.cpp @@ -11,7 +11,6 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-#include #include "cpu/arm/fp32/tensor_computing_fp32.h" EE attention_mask_fp32(TensorDesc inputDesc, @@ -56,7 +55,7 @@ EE attention_mask_fp32(TensorDesc inputDesc, if (start + loops > klen) { loops = UNI_MAX(klen - start, 0); } - memset(&mask[i * klen + start], 0, sizeof(F32) * loops); + UNI_MEMSET(&mask[i * klen + start], 0, sizeof(F32) * loops); } } I32 loops = tensorNumElements(inputDesc) / length; diff --git a/compute/tensor/src/cpu/arm/fp32/check.cpp b/compute/tensor/src/cpu/arm/fp32/check.cpp deleted file mode 100644 index 1e6894c7..00000000 --- a/compute/tensor/src/cpu/arm/fp32/check.cpp +++ /dev/null @@ -1,99 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -#include "cpu/arm/fp32/tensor_computing_fp32.h" - -EE check_fp32(TensorDesc inputDescA, - const F32 *inputA, - TensorDesc inputDescB, - const F32 *inputB, - CheckMode checkMode, - TensorDesc outputDesc, - I32 *output) -{ - if (nullptr == inputA || nullptr == inputB || nullptr == output) { - CHECK_STATUS(NULL_POINTER); - } - - if (tensorNumElements(inputDescA) != tensorNumElements(inputDescB)) { - CHECK_STATUS(NOT_MATCH); - } - - U32 size = tensorNumElements(inputDescA); - U32 loopOuter = inputDescA.dims[inputDescA.nDims - 1]; - I32 length = size / loopOuter; - if (tensorNumElements(outputDesc) != loopOuter) { - CHECK_STATUS(NOT_MATCH); - } - for (U32 j = 0; j < loopOuter; j++) { - const F32 *arrayA = inputA + j * length; - const F32 *arrayB = inputB + j * length; - switch (checkMode) { - case CHECK_GREAT: { - uint32x4_t count_v = vdupq_n_u32(0); - I32 i = 0; - for (; i < length - 3; i += 4) { - float32x4_t a = vld1q_f32(arrayA + i); - float32x4_t b = vld1q_f32(arrayA + i); - count_v = vaddq_u32(count_v, vcgtq_f32(a, b)); - } - I32 count = vaddvq_u32(count_v); - for (; i < length; i++) { - if (arrayA[i] > arrayB[i]) { - count++; - } - } - output[j] = (count == length); - break; - } - case CHECK_GREATEQUAL: { - uint32x4_t count_v = vdupq_n_u32(0); - I32 i = 0; - for (; i < length - 3; i += 4) { - float32x4_t a = vld1q_f32(arrayA + i); - float32x4_t b = vld1q_f32(arrayA + i); - count_v = vaddq_u32(count_v, vcgeq_f32(a, b)); - } - I32 count = vaddvq_u32(count_v); - for (; i < length; i++) { - if (arrayA[i] >= arrayB[i]) { - count++; - } - } - output[j] = (count == length); - break; - } - case CHECK_EQUAL: { - uint32x4_t count_v = vdupq_n_u32(0); - I32 i = 0; - for (; i < length - 3; i += 4) { - float32x4_t a = vld1q_f32(arrayA + i); - float32x4_t b = 
vld1q_f32(arrayA + i); - count_v = vaddq_u32(count_v, vceqq_f32(a, b)); - } - I32 count = vaddvq_u32(count_v); - for (; i < length; i++) { - if (arrayA[i] == arrayB[i]) { - count++; - } - } - output[j] = (count == length); - break; - } - default: - CHECK_STATUS(NOT_SUPPORTED); - break; - } - } - return SUCCESS; -} diff --git a/compute/tensor/src/cpu/arm/fp32/clip.cpp b/compute/tensor/src/cpu/arm/fp32/clip.cpp index a0b591be..220056f0 100644 --- a/compute/tensor/src/cpu/arm/fp32/clip.cpp +++ b/compute/tensor/src/cpu/arm/fp32/clip.cpp @@ -21,14 +21,15 @@ EE clip_fp32(F32 *input, F32 *output, I32 len, F32 minValue, F32 maxValue) float32x4_t min_v = vdupq_n_f32(minValue); float32x4_t max_v = vdupq_n_f32(maxValue); - - I32 i = 0; - for (i = 0; i < len - 3; i += 4) { +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (int i = 0; i < len - 3; i += 4) { float32x4_t in = vld1q_f32(input + i); float32x4_t tmp_v = vminq_f32(max_v, vmaxq_f32(min_v, in)); vst1q_f32(output + i, tmp_v); } - for (; i < len; i++) { + for (int i = len / 4 * 4; i < len; i++) { F32 value = input[i]; value = (value > minValue) ? value : minValue; value = (value < maxValue) ? value : maxValue; diff --git a/compute/tensor/src/cpu/arm/fp32/convolution_transform.cpp b/compute/tensor/src/cpu/arm/fp32/convolution_transform.cpp index 376a8735..cdcbf16c 100644 --- a/compute/tensor/src/cpu/arm/fp32/convolution_transform.cpp +++ b/compute/tensor/src/cpu/arm/fp32/convolution_transform.cpp @@ -26,7 +26,7 @@ inline EE convolution_transform_filter_kernel_fp32(TensorDesc filterDesc, } if (filterDesc.df == ftmDataFormat) { *ftmDesc = filterDesc; - memcpy(ftmArray, filterArray, tensorNumBytes(filterDesc)); + UNI_MEMCPY(ftmArray, filterArray, tensorNumBytes(filterDesc)); return SUCCESS; } if (filterDesc.df != DF_NCHW) { diff --git a/compute/tensor/src/cpu/arm/fp32/convolution_winograd_transform.h b/compute/tensor/src/cpu/arm/fp32/convolution_winograd_transform.h index 098e9c67..4f2de717 100644 --- a/compute/tensor/src/cpu/arm/fp32/convolution_winograd_transform.h +++ b/compute/tensor/src/cpu/arm/fp32/convolution_winograd_transform.h @@ -16,7 +16,7 @@ #ifdef _USE_FP32 #include -#include + #include "cpu/arm/fp32/arm_functions_fp32.h" inline void trans_W_4x4_3x3(float *WTM[36], float *W[9]) diff --git a/compute/tensor/src/cpu/arm/fp32/deconvolution_transform.cpp b/compute/tensor/src/cpu/arm/fp32/deconvolution_transform.cpp index 79bbb56d..fb2bcd8e 100644 --- a/compute/tensor/src/cpu/arm/fp32/deconvolution_transform.cpp +++ b/compute/tensor/src/cpu/arm/fp32/deconvolution_transform.cpp @@ -25,7 +25,7 @@ inline EE deconvolution_transform_filter_kernel_fp32(TensorDesc filterDesc, } if (filterDesc.df == ftmDataFormat) { *ftmDesc = filterDesc; - memcpy(ftmArray, filterArray, tensorNumBytes(filterDesc)); + UNI_MEMCPY(ftmArray, filterArray, tensorNumBytes(filterDesc)); return SUCCESS; } if (filterDesc.df != DF_NCHW) { diff --git a/compute/tensor/src/cpu/arm/fp32/gru.cpp b/compute/tensor/src/cpu/arm/fp32/gru.cpp index eeb16490..584f0793 100644 --- a/compute/tensor/src/cpu/arm/fp32/gru.cpp +++ b/compute/tensor/src/cpu/arm/fp32/gru.cpp @@ -11,7 +11,6 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-#include #include "cpu/arm/fp32/tensor_computing_fp32.h" #include "cpu/arm/fp32/mvm_nkn32.h" @@ -54,9 +53,9 @@ EE grucell_fp32(TensorDesc xDesc, U32 batch = in; I32 xDim = ix; - I32 hDim = rnnParamSpec.numOutput; + I32 hDim = rnnParamSpec.num_outputs; I32 column = hDim; - int num1 = rnnParamSpec.biDirection ? 2 : 1; + int num1 = rnnParamSpec.bi_direction ? 2 : 1; U32 steps = batchStrideH / hDim / num1; if (!(idt == DT_F32 && fdt == DT_F32 && odt == DT_F32)) { CHECK_STATUS(NOT_MATCH); @@ -64,8 +63,7 @@ EE grucell_fp32(TensorDesc xDesc, if (!(3 * column == (I32)fn * 32 && (ix + oh) == fk && in == on)) { CHECK_STATUS(NOT_MATCH); } - ActivationMode activationMode = rnnParamSpec.activationMode; - if (activationMode != ACTIVATION_TANH) { + if (rnnParamSpec.activation_type != ACTIVATION_TANH) { CHECK_STATUS(NOT_SUPPORTED); } @@ -84,16 +82,16 @@ EE grucell_fp32(TensorDesc xDesc, F32 *currentBatchH = currentHArray + m * currentHStride; F32 *currentOutput = outputArray + m * batchStrideH; if (xDim > 0) { - memcpy(xhArray, currentXArray + m * batchStrideX, xDim * sizeof(F32)); - memcpy(xhArray + xDim, lastBatchH, hDim * sizeof(F32)); + UNI_MEMCPY(xhArray, currentXArray + m * batchStrideX, xDim * sizeof(F32)); + UNI_MEMCPY(xhArray + xDim, lastBatchH, hDim * sizeof(F32)); } else { intermediateH = tmpArray; xhArray = lastBatchH; - memcpy(currentOutput, lastBatchH, hDim * sizeof(F32)); + UNI_MEMCPY(currentOutput, lastBatchH, hDim * sizeof(F32)); } const F32 *mBias = (const F32 *)bias[0] + m * steps * column * 3; - memcpy(intermediateH, mBias, column * 2 * sizeof(F32)); + UNI_MEMCPY(intermediateH, mBias, column * 2 * sizeof(F32)); mvm_nkn32(column * 2 / 32, fk, (const F32 *)filter[0], xhArray, intermediateH); F32 *out_z = intermediateH; F32 *out_r = out_z + column; @@ -111,12 +109,12 @@ EE grucell_fp32(TensorDesc xDesc, if (rnnParamSpec.mode == RNN_GRU_LBR) { F32 *h_x_b = (F32 *)mBias + column * 2; F32 *h_h_b = (F32 *)bias[1]; - memcpy(out_h, h_h_b, column * sizeof(F32)); + UNI_MEMCPY(out_h, h_h_b, column * sizeof(F32)); mvm_nkn32(column / 32, hDim, (const F32 *)filter[0] + column * 2 * fk + column * xDim, xhArray + xDim, out_h); array_mul_f32(out_r, out_h, out_h, hDim); if (xDim > 0) { - memcpy(out_r, h_x_b, column * sizeof(F32)); + UNI_MEMCPY(out_r, h_x_b, column * sizeof(F32)); mvm_nkn32( column / 32, xDim, (const F32 *)filter[0] + column * 2 * fk, xhArray, out_r); h_x_b = out_r; @@ -124,7 +122,7 @@ EE grucell_fp32(TensorDesc xDesc, array_add_f32(h_x_b, out_h, out_h, hDim); } else { array_mul_f32(out_r, xhArray + xDim, xhArray + xDim, hDim); - memcpy(out_h, mBias + column * 2, column * sizeof(F32)); + UNI_MEMCPY(out_h, mBias + column * 2, column * sizeof(F32)); mvm_nkn32(column / 32, fk, (const F32 *)filter[0] + column * 2 * fk, xhArray, out_h); } for (h = 0; h < column - 3; h += 4) { @@ -147,7 +145,7 @@ EE grucell_fp32(TensorDesc xDesc, array_scale_f32(out_z, out_z, column, -1, 1); array_mul_f32(out_z, out_h, out_h, column); array_add_f32(out_r, out_h, currentOutput, column); - memcpy(currentBatchH, currentOutput, sizeof(F32) * hDim); + UNI_MEMCPY(currentBatchH, currentOutput, sizeof(F32) * hDim); } return SUCCESS; } diff --git a/compute/tensor/src/cpu/arm/fp32/lstm.cpp b/compute/tensor/src/cpu/arm/fp32/lstm.cpp index 1233d355..35f82da5 100644 --- a/compute/tensor/src/cpu/arm/fp32/lstm.cpp +++ b/compute/tensor/src/cpu/arm/fp32/lstm.cpp @@ -11,7 +11,6 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT 
OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -#include #include "cpu/arm/fp32/tensor_computing_fp32.h" #include "cpu/arm/fp32/mvm_nkn32.h" @@ -54,10 +53,10 @@ EE lstmcell_fp32(TensorDesc xDesc, U32 batch = in; I32 xDim = ix; - I32 hDim = rnnParamSpec.numOutput; - I32 column = (rnnParamSpec.numProjection > 0) ? rnnParamSpec.numProjection - : rnnParamSpec.numOutput; - int num1 = rnnParamSpec.biDirection ? 2 : 1; + I32 hDim = rnnParamSpec.num_outputs; + I32 column = (rnnParamSpec.num_projection > 0) ? rnnParamSpec.num_projection + : rnnParamSpec.num_outputs; + int num1 = rnnParamSpec.bi_direction ? 2 : 1; U32 steps = batchStrideH / hDim / num1; if (!(idt == DT_F32 && fdt == DT_F32 && odt == DT_F32)) { CHECK_STATUS(NOT_MATCH); @@ -65,9 +64,8 @@ EE lstmcell_fp32(TensorDesc xDesc, if (!(4 * column == (I32)fn * 32 && (ix + oh) == fk && in == on)) { CHECK_STATUS(NOT_MATCH); } - F32 forgetBias = rnnParamSpec.forgetBias; - ActivationMode activationMode = rnnParamSpec.activationMode; - if (activationMode != ACTIVATION_TANH) { + F32 forgetBias = rnnParamSpec.forget_bias; + if (rnnParamSpec.activation_type != ACTIVATION_TANH) { CHECK_STATUS(NOT_SUPPORTED); } @@ -88,15 +86,15 @@ EE lstmcell_fp32(TensorDesc xDesc, for (U32 m = 0; m < batch; m++) { F32 *lastBatchH = lastHArray + m * lastHStride; if (xDim > 0) { - memcpy(xhArray, currentXArray + m * batchStrideX, xDim * sizeof(F32)); - memcpy(xhArray + xDim, lastBatchH, hDim * sizeof(F32)); + UNI_MEMCPY(xhArray, currentXArray + m * batchStrideX, xDim * sizeof(F32)); + UNI_MEMCPY(xhArray + xDim, lastBatchH, hDim * sizeof(F32)); } else { intermediateH = tmpArray; xhArray = lastBatchH; } const F32 *mBias = (const F32 *)bias[0] + m * steps * column * 4; - memcpy(intermediateH, mBias, column * 4 * sizeof(F32)); + UNI_MEMCPY(intermediateH, mBias, column * 4 * sizeof(F32)); mvm_nkn32(fn, fk, (const F32 *)filter[0], xhArray, intermediateH); F32 *out_i = intermediateH; F32 *out_g = out_i + column; @@ -109,12 +107,12 @@ EE lstmcell_fp32(TensorDesc xDesc, F32 *currentOutput = outputArray + m * batchStrideH; F32 *tmpState, *tmpHH, *tmpH; - if (rnnParamSpec.zoneoutCell == 0) { + if (rnnParamSpec.zoneout_cell == 0) { tmpState = currentBatchState; } else { tmpState = out_i; } - if (rnnParamSpec.numProjection > 0) { + if (rnnParamSpec.num_projection > 0) { tmpHH = out_g; tmpH = currentOutput; } else { @@ -149,26 +147,26 @@ EE lstmcell_fp32(TensorDesc xDesc, tmpState[h] = C_s; tmpHH[h] = value; } - if (rnnParamSpec.zoneoutCell != 0) { - array_scale_f32(tmpState, tmpState, column, 1 - rnnParamSpec.zoneoutCell, 0); - array_scale_f32(lastBatchState, lastBatchState, column, rnnParamSpec.zoneoutCell, 0); + if (rnnParamSpec.zoneout_cell != 0) { + array_scale_f32(tmpState, tmpState, column, 1 - rnnParamSpec.zoneout_cell, 0); + array_scale_f32(lastBatchState, lastBatchState, column, rnnParamSpec.zoneout_cell, 0); array_add_f32(tmpState, lastBatchState, currentBatchState, column); } - if (rnnParamSpec.numProjection > 0) { - memset(tmpH, 0, sizeof(F32) * hDim); - mvm_nkn32(hDim / 32, rnnParamSpec.numProjection, (const F32 *)filter[1], tmpHH, tmpH); + if (rnnParamSpec.num_projection > 0) { + UNI_MEMSET(tmpH, 0, sizeof(F32) * hDim); + mvm_nkn32(hDim / 32, rnnParamSpec.num_projection, (const F32 *)filter[1], tmpHH, tmpH); } - if (rnnParamSpec.zoneoutOutput != 0) { - if (rnnParamSpec.numProjection > 0) { - array_scale_f32(tmpH, out_f, hDim, 1 - rnnParamSpec.zoneoutOutput, 0); + if (rnnParamSpec.zoneout_output != 0) { + if 
(rnnParamSpec.num_projection > 0) { + array_scale_f32(tmpH, out_f, hDim, 1 - rnnParamSpec.zoneout_output, 0); } else { - array_scale_f32(tmpHH, out_f, hDim, 1 - rnnParamSpec.zoneoutOutput, 0); + array_scale_f32(tmpHH, out_f, hDim, 1 - rnnParamSpec.zoneout_output, 0); } - array_scale_f32(lastBatchH, lastBatchH, hDim, rnnParamSpec.zoneoutOutput, 0); + array_scale_f32(lastBatchH, lastBatchH, hDim, rnnParamSpec.zoneout_output, 0); array_add_f32(out_f, lastBatchH, currentBatchH, hDim); } else { - memcpy(currentBatchH, currentOutput, sizeof(F32) * hDim); + UNI_MEMCPY(currentBatchH, currentOutput, sizeof(F32) * hDim); } } return SUCCESS; diff --git a/compute/tensor/src/cpu/arm/fp32/normalization.cpp b/compute/tensor/src/cpu/arm/fp32/normalization.cpp index 6604b485..3c4ac5bc 100644 --- a/compute/tensor/src/cpu/arm/fp32/normalization.cpp +++ b/compute/tensor/src/cpu/arm/fp32/normalization.cpp @@ -14,10 +14,11 @@ #include #include "cpu/arm/fp32/tensor_computing_fp32.h" -inline void array_norm_scale_fp32( +static float eps = 1e-6; + +inline static void array_norm_scale_fp32( F32 *input, F32 *output, I32 len, F32 mean, F32 var, F32 *alpha, F32 *beta) { - F32 eps = 1e-6; F32 std_value = sqrt(var + eps); float32x4_t mean_v = vdupq_n_f32(mean); float32x4_t std_v = vdupq_n_f32(std_value); @@ -38,14 +39,10 @@ inline void array_norm_scale_fp32( } } -EE layer_normalization_fp32( +static EE layer_normalization_nhwc( TensorDesc inputDesc, F32 *input, F32 *alpha, F32 *beta, TensorDesc outputDesc, F32 *output) { UNUSED(outputDesc); - if (nullptr == alpha || nullptr == beta || nullptr == input || nullptr == output) { - CHECK_STATUS(NULL_POINTER); - } - U32 size = tensorNumElements(inputDesc); I32 size_inner = inputDesc.dims[0]; I32 size_outer = size / size_inner; @@ -57,6 +54,87 @@ EE layer_normalization_fp32( array_norm_scale_fp32(current_input, current_output, size_inner, mean, var, alpha, beta); } + return SUCCESS; +} + +static EE layer_normalization_nchwc8( + TensorDesc inputDesc, F32 *input, F32 *alpha, F32 *beta, TensorDesc outputDesc, F32 *output) +{ + UNUSED(outputDesc); + int n = inputDesc.dims[inputDesc.nDims - 1]; + int c = inputDesc.dims[inputDesc.nDims - 2]; + int hw = 1; + for (unsigned int i = 0; i < inputDesc.nDims - 2; i++) { + hw *= inputDesc.dims[i]; + } + int c8 = c / 8; + for (int i = 0; i < n; i++) { + for (int j = 0; j < hw; j++) { + float32x4_t sum_v = vdupq_n_f32(0); + for (int k = 0; k < c8; k++) { + int id = ((i * c8 + k) * hw + j) * 8; + sum_v = vaddq_f32(sum_v, vld1q_f32(input + id)); + sum_v = vaddq_f32(sum_v, vld1q_f32(input + id + 4)); + } + F32 mean = vaddvq_f32(sum_v) / c; + float32x4_t mean_v = vdupq_n_f32(mean); + sum_v = vdupq_n_f32(0); + for (int k = 0; k < c8; k++) { + int id = ((i * c8 + k) * hw + j) * 8; + float32x4_t tmp_v = vsubq_f32(vld1q_f32(input + id), mean_v); + sum_v = vfmaq_f32(sum_v, tmp_v, tmp_v); + tmp_v = vsubq_f32(vld1q_f32(input + id + 4), mean_v); + sum_v = vfmaq_f32(sum_v, tmp_v, tmp_v); + } + F32 var = vaddvq_f32(sum_v) / c; + F32 std_value = sqrt(var + eps); + + float32x4_t std_v = vdupq_n_f32(std_value); + for (int k = 0, kk = 0; k < c8; k++, kk += 8) { + int id = ((i * c8 + k) * hw + j) * 8; + float32x4_t in = vld1q_f32(input + id); + float32x4_t alpha_v = vld1q_f32(alpha + kk); + float32x4_t beta_v = vld1q_f32(beta + kk); + float32x4_t tmp_v = vsubq_f32(in, mean_v); + tmp_v = vdivq_f32(tmp_v, std_v); + tmp_v = vfmaq_f32(beta_v, alpha_v, tmp_v); + vst1q_f32(output + id, tmp_v); + + in = vld1q_f32(input + id + 4); + alpha_v = vld1q_f32(alpha + kk + 4); 
+ beta_v = vld1q_f32(beta + kk + 4); + tmp_v = vsubq_f32(in, mean_v); + tmp_v = vdivq_f32(tmp_v, std_v); + tmp_v = vfmaq_f32(beta_v, alpha_v, tmp_v); + vst1q_f32(output + id + 4, tmp_v); + } + } + } return SUCCESS; } + +EE layer_normalization_fp32(TensorDesc inputDesc, + F32 *input, + LayerNormParamSpec p, + F32 *alpha, + F32 *beta, + TensorDesc outputDesc, + F32 *output) +{ + if (nullptr == alpha || nullptr == beta || nullptr == input || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + + EE ret = NOT_SUPPORTED; + if (inputDesc.df == DF_NCHWC8) { + if (p.axis == 1) { + ret = layer_normalization_nchwc8(inputDesc, input, alpha, beta, outputDesc, output); + } + } else { + if (p.axis == -1) { + ret = layer_normalization_nhwc(inputDesc, input, alpha, beta, outputDesc, output); + } + } + return ret; +} diff --git a/compute/tensor/src/cpu/arm/fp32/pooling.cpp b/compute/tensor/src/cpu/arm/fp32/pooling.cpp index db302d93..dc165ae8 100644 --- a/compute/tensor/src/cpu/arm/fp32/pooling.cpp +++ b/compute/tensor/src/cpu/arm/fp32/pooling.cpp @@ -94,6 +94,7 @@ EE pooling_bp_c8_fp32(const F32 *input, int hend, int wstart, int wend, + int pool, F32 *output, U32 stride, PoolingParamSpec poolingParamSpec) @@ -103,7 +104,7 @@ EE pooling_bp_c8_fp32(const F32 *input, if (pm != POOLING_MEAN) { ret = NOT_SUPPORTED; } - float32x4_t poolSize = vdupq_n_f32((hend - hstart) * (wend - wstart)); + float32x4_t poolSize = vdupq_n_f32(pool); float32x4_t in0 = vdivq_f32(vld1q_f32(input), poolSize); float32x4_t in1 = vdivq_f32(vld1q_f32(input + 4), poolSize); for (int kernelH = hstart; kernelH < hend; kernelH++) { diff --git a/compute/tensor/src/cpu/arm/fp32/scale.cpp b/compute/tensor/src/cpu/arm/fp32/scale.cpp index 394b3be7..172cd8a1 100644 --- a/compute/tensor/src/cpu/arm/fp32/scale.cpp +++ b/compute/tensor/src/cpu/arm/fp32/scale.cpp @@ -42,28 +42,39 @@ EE scale_nchwc8_fp32( return SUCCESS; } +template EE scale_nchw_fp32( F32 *input, F32 *alpha, F32 *beta, I32 in, I32 ic, I32 elements_per_channel, F32 *output) { float32x4_t one = vdupq_n_f32(1.); float32x4_t zero = vdupq_n_f32(0.); - U32 index = 0; + U32 dst = 0, src = 0; for (I32 n = 0; n < in; n++) { for (I32 c = 0; c < ic; c++) { float32x4_t alpha_vec = (alpha == nullptr) ? one : vdupq_n_f32(alpha[c]); float32x4_t beta_vec = (beta == nullptr) ? zero : vdupq_n_f32(beta[c]); I32 i = 0; for (; i < elements_per_channel - 3; i += 4) { - float32x4_t in_vec = vld1q_f32(input + index); + if (icoc_equal) { + src = (n * ic + c) * elements_per_channel + i; + } else { + src = n * elements_per_channel + i; + } + float32x4_t in_vec = vld1q_f32(input + src); float32x4_t out_vec = vfmaq_f32(beta_vec, alpha_vec, in_vec); - vst1q_f32(output + index, out_vec); - index += 4; + vst1q_f32(output + dst, out_vec); + dst += 4; } for (; i < elements_per_channel; i++) { + if (icoc_equal) { + src = (n * ic + c) * elements_per_channel + i; + } else { + src = n * elements_per_channel + i; + } float alpha_s = (alpha == nullptr) ? 1 : alpha[c]; float beta_s = (beta == nullptr) ? 0 : beta[c]; - output[index] = alpha_s * input[index] + beta_s; - index++; + output[dst] = alpha_s * input[src] + beta_s; + dst++; } } } @@ -126,7 +137,11 @@ EE scale_fp32(F32 *input, EE ret = SUCCESS; // If oc is 1, it means that weights/vectors have only one param, so we need use the calculation logic of nchw. 
if (axis == 1 || axis == 0 || oc == 1) { - ret = scale_nchw_fp32(input, alpha, beta, on, oc, elements_per_channel, output); + if (ic == oc) { + ret = scale_nchw_fp32<true>(input, alpha, beta, on, oc, elements_per_channel, output); + } else { + ret = scale_nchw_fp32<false>(input, alpha, beta, on, oc, elements_per_channel, output); + } } else if (axis == nDims - 1) { if (ic == oc) { ret = scale_nhwc_fp32<true>(input, alpha, beta, on, oc, elements_per_channel, output);
diff --git a/compute/tensor/src/cpu/arm/fp32/softmax.cpp b/compute/tensor/src/cpu/arm/fp32/softmax.cpp index f352e428..f874d264 100644 --- a/compute/tensor/src/cpu/arm/fp32/softmax.cpp +++ b/compute/tensor/src/cpu/arm/fp32/softmax.cpp @@ -14,59 +14,76 @@ #include "cpu/arm/fp32/tensor_computing_fp32.h" #include "tensor_transpose.h" -void softmax_lastAxis_fp32(const F32 *input, I32 loopOuter, I32 loops, F32 *output) +template <bool logsoftmax> +static void softmax_lastAxis_fp32(const F32 *input, I32 loopOuter, I32 loops, F32 *output) { for (I32 i = 0; i < loopOuter; i++) { const F32 *inputPtr = input + i * loops; F32 *outputPtr = output + i * loops; - float32x4_t max_v, sub_v, sum_v, tmp_v; + float32x4_t max_v, tmp_v; F32 max_s, tmp_s; - array_minmax_value_f32(inputPtr, loops, 2, &max_s); - max_v = vdupq_n_f32(max_s); - sum_v = vdupq_n_f32(0); - + if (!logsoftmax) { + array_minmax_value_f32(inputPtr, loops, 2, &max_s); + max_v = vdupq_n_f32(max_s); + } I32 j = 0; - F32 sum_s = 0; - for (j = 0; j < loops - 3; j += 4) { + float32x4_t sum_v = vdupq_n_f32(0); + for (; j < loops - 3; j += 4) { float32x4_t in = vld1q_f32(inputPtr + j); - sub_v = vsubq_f32(in, max_v); - tmp_v = vexpq_f32_03_percent_error(sub_v); + if (!logsoftmax) { + in = vsubq_f32(in, max_v); + } + tmp_v = vexpq_f32_03_percent_error(in); sum_v = vaddq_f32(sum_v, tmp_v); - vst1q_f32(outputPtr + j, tmp_v); + if (!logsoftmax) { + vst1q_f32(outputPtr + j, tmp_v); + } } - sum_s += vaddvq_f32(sum_v); + F32 sum_s = vaddvq_f32(sum_v); for (; j < loops; j++) { - tmp_s = exp(inputPtr[j] - max_s); - outputPtr[j] = tmp_s; + if (logsoftmax) { + tmp_s = exp(inputPtr[j]); + } else { + tmp_s = exp(inputPtr[j] - max_s); + outputPtr[j] = tmp_s; + } sum_s += tmp_s; } - array_scale_f32(outputPtr, outputPtr, loops, 1.0 / sum_s, 0); + if (logsoftmax) { + array_scale_f32(inputPtr, outputPtr, loops, 1.0, -log(sum_s)); + } else { + array_scale_f32(outputPtr, outputPtr, loops, 1.0 / sum_s, 0); + } } } +template <bool logsoftmax> void softmax_anyAxis_fp32(const F32 *input, I32 loopOuter, I32 loops, I32 loopInner, F32 *output) { std::vector<F32> buffer(loopInner * 2); F32 *maxBuffer = &buffer[0]; F32 *sumBuffer = &buffer[loopInner]; I32 k = 0; + F32 tmp_s; for (I32 i = 0; i < loopOuter; i++) { const F32 *inputPtrBase = input + i * loops * loopInner; F32 *outputPtrBase = output + i * loops * loopInner; - memcpy(maxBuffer, inputPtrBase, loopInner * sizeof(F32)); - memset(sumBuffer, 0, loopInner * sizeof(F32)); - for (I32 j = 1; j < loops; j++) { - const F32 *inputPtr = inputPtrBase + j * loopInner; - for (k = 0; k < loopInner - 3; k += 4) { - float32x4_t in_v = vld1q_f32(inputPtr + k); - float32x4_t out_v = vld1q_f32(maxBuffer + k); - float32x4_t max_v = vmaxq_f32(in_v, out_v); - vst1q_f32(maxBuffer + k, max_v); - } - for (; k < loopInner; k++) { - maxBuffer[k] = UNI_MAX(maxBuffer[k], inputPtr[k]); + UNI_MEMSET(sumBuffer, 0, loopInner * sizeof(F32)); + if (!logsoftmax) { + UNI_MEMCPY(maxBuffer, inputPtrBase, loopInner * sizeof(F32)); + for (I32 j = 1; j < loops; j++) { + const F32 *inputPtr = inputPtrBase + j * loopInner; + for (k = 0; k < loopInner - 3; k += 4) { + float32x4_t in_v = vld1q_f32(inputPtr + k); + float32x4_t out_v = vld1q_f32(maxBuffer + k); + float32x4_t max_v = vmaxq_f32(in_v, out_v); + vst1q_f32(maxBuffer + k, max_v); + } + for (; k < loopInner; k++) { + maxBuffer[k] = UNI_MAX(maxBuffer[k], inputPtr[k]); + } } } for (I32 j = 0; j < loops; j++) { @@ -74,35 +91,69 @@ void softmax_anyAxis_fp32(const F32 *input, I32 loopOuter, I32 loops, I32 loopIn F32 *outputPtr = outputPtrBase + j * loopInner; for (k = 0; k < loopInner - 3; k += 4) { float32x4_t in_v = vld1q_f32(inputPtr + k); - float32x4_t max_v = vld1q_f32(maxBuffer + k); - float32x4_t sub_v = vsubq_f32(in_v, max_v); - float32x4_t exp_v = vexpq_f32_03_percent_error(sub_v); + if (!logsoftmax) { + in_v = vsubq_f32(in_v, vld1q_f32(maxBuffer + k)); + } + float32x4_t exp_v = vexpq_f32_03_percent_error(in_v); float32x4_t sum_v = vld1q_f32(sumBuffer + k); sum_v = vaddq_f32(sum_v, exp_v); vst1q_f32(sumBuffer + k, sum_v); - vst1q_f32(outputPtr + k, exp_v); + if (!logsoftmax) { + vst1q_f32(outputPtr + k, exp_v); + } } for (; k < loopInner; k++) { - outputPtr[k] = exp(inputPtr[k] - maxBuffer[k]); - sumBuffer[k] += outputPtr[k]; + if (logsoftmax) { + tmp_s = exp(inputPtr[k]); + } else { + tmp_s = exp(inputPtr[k] - maxBuffer[k]); + outputPtr[k] = tmp_s; + } + sumBuffer[k] += tmp_s; } } - for (I32 j = 0; j < loops; j++) { - F32 *outputPtr = outputPtrBase + j * loopInner; + if (logsoftmax) { for (k = 0; k < loopInner - 3; k += 4) { - float32x4_t out_v = vld1q_f32(outputPtr + k); float32x4_t sum_v = vld1q_f32(sumBuffer + k); - out_v = vdivq_f32(out_v, sum_v); - vst1q_f32(outputPtr + k, out_v); + sum_v = vlogq_f32(sum_v); + vst1q_f32(sumBuffer + k, sum_v); } for (; k < loopInner; k++) { - outputPtr[k] /= sumBuffer[k]; + sumBuffer[k] = log(sumBuffer[k]); + } + for (I32 j = 0; j < loops; j++) { + const F32 *inputPtr = inputPtrBase + j * loopInner; + F32 *outputPtr = outputPtrBase + j * loopInner; + for (k = 0; k < loopInner - 3; k += 4) { + float32x4_t out_v = vld1q_f32(inputPtr + k); + float32x4_t sum_v = vld1q_f32(sumBuffer + k); + out_v = vsubq_f32(out_v, sum_v); + vst1q_f32(outputPtr + k, out_v); + } + for (; k < loopInner; k++) { + outputPtr[k] -= sumBuffer[k]; + } + } + } else { + for (I32 j = 0; j < loops; j++) { + F32 *outputPtr = outputPtrBase + j * loopInner; + for (k = 0; k < loopInner - 3; k += 4) { + float32x4_t out_v = vld1q_f32(outputPtr + k); + float32x4_t sum_v = vld1q_f32(sumBuffer + k); + out_v = vdivq_f32(out_v, sum_v); + vst1q_f32(outputPtr + k, out_v); + } + for (; k < loopInner; k++) { + outputPtr[k] /= sumBuffer[k]; + } } } } } -EE softmax_fp32(TensorDesc inputDesc, const F32 *input, int axis, TensorDesc outputDesc, F32 *output) +template <bool logsoftmax> +static EE softmax_kernel( + TensorDesc inputDesc, const F32 *input, int axis, TensorDesc outputDesc, F32 *output) { UNUSED(outputDesc); if (nullptr == input || nullptr == output) { @@ -145,9 +196,20 @@ EE softmax_fp32(TensorDesc inputDesc, const F32 *input, int axis, TensorDesc out } U32 loop_outer = size / loops / loop_inner; if (axis == 0) { - softmax_lastAxis_fp32(input, loop_outer, loops, output); + softmax_lastAxis_fp32<logsoftmax>(input, loop_outer, loops, output); } else { - softmax_anyAxis_fp32(input, loop_outer, loops, loop_inner, output); + softmax_anyAxis_fp32<logsoftmax>(input, loop_outer, loops, loop_inner, output); } return SUCCESS; } + +EE softmax_fp32(TensorDesc inputDesc, const F32 *input, int axis, TensorDesc outputDesc, F32 *output) +{ + return softmax_kernel<false>(inputDesc, input, axis, outputDesc, output); +} + +EE logsoftmax_fp32( + TensorDesc inputDesc, const F32 *input, int axis, TensorDesc outputDesc, F32 *output) +{ + return softmax_kernel<true>(inputDesc, input, axis, outputDesc, output); +}
diff --git a/compute/tensor/src/cpu/arm/fp32/tensor_computing_fp32.h b/compute/tensor/src/cpu/arm/fp32/tensor_computing_fp32.h index 01cc8eb8..9391c04c 100644 --- a/compute/tensor/src/cpu/arm/fp32/tensor_computing_fp32.h +++ b/compute/tensor/src/cpu/arm/fp32/tensor_computing_fp32.h @@ -55,8 +55,8 @@ EE convolution_gemm_V8(TensorDesc inputDesc, TensorDesc outputDesc, F32 *outArray, ActivationParamSpec activationDesc); -#else -EE convolution_gemm_V7(TensorDesc inputDesc, + +EE convolution_gemm_icnchw_V8(TensorDesc inputDesc, F32 *inArray, TensorDesc filterDesc, const F32 *filterArray, @@ -68,10 +68,8 @@ EE convolution_gemm_V7(TensorDesc inputDesc, TensorDesc outputDesc, F32 *outArray, ActivationParamSpec activationDesc); -#endif - -#ifdef __aarch64__ -EE convolution_gemm_icnchw_V8(TensorDesc inputDesc, +#else +EE convolution_gemm_V7(TensorDesc inputDesc, F32 *inArray, TensorDesc filterDesc, const F32 *filterArray, @@ -83,7 +81,7 @@ EE convolution_gemm_icnchw_V8(TensorDesc inputDesc, TensorDesc outputDesc, F32 *outArray, ActivationParamSpec activationDesc); -#else + EE convolution_gemm_icnchw_V7(TensorDesc inputDesc, F32 *inArray, TensorDesc filterDesc, @@ -138,6 +136,7 @@ EE pooling_bp_c8_fp32(const F32 *input, int hend, int wstart, int wend, + int poolSize, F32 *output, U32 stride, PoolingParamSpec poolingParamSpec); @@ -145,6 +144,9 @@ EE pooling_bp_c8_fp32(const F32 *input, EE softmax_fp32( TensorDesc inputDesc, const F32 *input, int axis, TensorDesc outputDesc, F32 *output); +EE logsoftmax_fp32( + TensorDesc inputDesc, const F32 *input, int axis, TensorDesc outputDesc, F32 *output); + EE concat_fp32(std::vector inputDesc, std::vector input, TensorDesc outputDesc, @@ -243,8 +245,13 @@ EE power_fp32(TensorDesc inputDesc, TensorDesc outputDesc, F32 *output); -EE layer_normalization_fp32( - TensorDesc inputDesc, F32 *input, F32 *alpha, F32 *beta, TensorDesc outputDesc, F32 *output); +EE layer_normalization_fp32(TensorDesc inputDesc, + F32 *input, + LayerNormParamSpec p, + F32 *alpha, + F32 *beta, + TensorDesc outputDesc, + F32 *output); EE scale_fp32(F32 *input, I32 axis,
diff --git a/compute/tensor/src/cpu/arm/fp32/v7/convolution_gemm_V7.cpp b/compute/tensor/src/cpu/arm/fp32/v7/convolution_gemm_V7.cpp index 2f121fca..b27091ff 100644 --- a/compute/tensor/src/cpu/arm/fp32/v7/convolution_gemm_V7.cpp +++ b/compute/tensor/src/cpu/arm/fp32/v7/convolution_gemm_V7.cpp @@ -42,7 +42,7 @@ EE convolution_gemm_V7(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); it = ft = ot = 1; p.dilatedRate_t = p.stride_t = 1; - p.padding_before = p.padding_after = 0; + p.pad_before = p.pad_after = 0; } else if (tensorIs5d(inputDesc)) { CHECK_STATUS(tensor5dGet(inputDesc, &idt, &idf, &in, &ic, &it, &ih, &iw)); CHECK_STATUS(tensor5dGet(filterDesc, &fdt, &fdf, &fn, &fc, &ft, &fh, &fw)); @@ -56,9 +56,9 @@ EE convolution_gemm_V7(TensorDesc inputDesc, } oc /= 8; - U32 it_pad = it + p.padding_before + p.padding_after; - U32 ih_pad = ih + p.padding_top + p.padding_bottom; - U32 iw_pad = iw + p.padding_left + p.padding_right; + U32 it_pad = it + p.pad_before + p.pad_after; + U32 ih_pad = ih + p.pad_top + p.pad_bottom; + U32 iw_pad = iw + p.pad_left + p.pad_right; I64 K = ic * ft * fh * fw; I32 ohow = ot * oh * ow; F32 *in_pack = ((F32 *)tmp) + ic * it_pad * ih_pad * iw_pad; @@ -116,53 +116,52 @@ EE
convolution_gemm_V7(TensorDesc inputDesc, // NHWChw6 F32 *in_pack_c8hw6 = thread_in_pack + (id * params[0] + c) * 8 * 6; - __asm__ __volatile__("vld1.f32 {d0-d3}, [%[in_0]]\n" - "vld1.f32 {d4-d7}, [%[in_1]]\n" - "vld1.f32 {d8-d11}, [%[in_2]]\n" - "vld1.f32 {d12-d15}, [%[in_3]]\n" - "vld1.f32 {d16-d19}, [%[in_4]]\n" - "vld1.f32 {d20-d23}, [%[in_5]]\n" - - "vzip.32 q0, q2\n" - "vzip.32 q4, q6\n" - "vzip.32 q8, q10\n" - - "vst1.f32 {d0}, [%[pack]]!\n" - "vst1.f32 {d8}, [%[pack]]!\n" - "vst1.f32 {d16}, [%[pack]]!\n" - "vst1.f32 {d1}, [%[pack]]!\n" - "vst1.f32 {d9}, [%[pack]]!\n" - "vst1.f32 {d17}, [%[pack]]!\n" - "vst1.f32 {d4}, [%[pack]]!\n" - "vst1.f32 {d12}, [%[pack]]!\n" - "vst1.f32 {d20}, [%[pack]]!\n" - "vst1.f32 {d5}, [%[pack]]!\n" - "vst1.f32 {d13}, [%[pack]]!\n" - "vst1.f32 {d21}, [%[pack]]!\n" - - "vzip.32 q1, q3\n" - "vzip.32 q5, q7\n" - "vzip.32 q9, q11\n" - - "vst1.f32 {d2}, [%[pack]]!\n" - "vst1.f32 {d10}, [%[pack]]!\n" - "vst1.f32 {d18}, [%[pack]]!\n" - "vst1.f32 {d3}, [%[pack]]!\n" - "vst1.f32 {d11}, [%[pack]]!\n" - "vst1.f32 {d19}, [%[pack]]!\n" - "vst1.f32 {d6}, [%[pack]]!\n" - "vst1.f32 {d14}, [%[pack]]!\n" - "vst1.f32 {d22}, [%[pack]]!\n" - "vst1.f32 {d7}, [%[pack]]!\n" - "vst1.f32 {d15}, [%[pack]]!\n" - "vst1.f32 {d23}, [%[pack]]!\n" - : [pack] "+r"(in_pack_c8hw6), [in_0] "+r"(in_0), - [in_1] "+r"(in_1), [in_2] "+r"(in_2), - [in_3] "+r"(in_3), [in_4] "+r"(in_4), - [in_5] "+r"(in_5) - : - : "memory", "cc", "q0", "q1", "q2", "q3", "q4", - "q5", "q6", "q7", "q8", "q9", "q10", "q11"); + __asm__ __volatile__( + "vld1.f32 {d0-d3}, [%[in_0]]\n" + "vld1.f32 {d4-d7}, [%[in_1]]\n" + "vld1.f32 {d8-d11}, [%[in_2]]\n" + "vld1.f32 {d12-d15}, [%[in_3]]\n" + "vld1.f32 {d16-d19}, [%[in_4]]\n" + "vld1.f32 {d20-d23}, [%[in_5]]\n" + + "vzip.32 q0, q2\n" + "vzip.32 q4, q6\n" + "vzip.32 q8, q10\n" + + "vst1.f32 {d0}, [%[pack]]!\n" + "vst1.f32 {d8}, [%[pack]]!\n" + "vst1.f32 {d16}, [%[pack]]!\n" + "vst1.f32 {d1}, [%[pack]]!\n" + "vst1.f32 {d9}, [%[pack]]!\n" + "vst1.f32 {d17}, [%[pack]]!\n" + "vst1.f32 {d4}, [%[pack]]!\n" + "vst1.f32 {d12}, [%[pack]]!\n" + "vst1.f32 {d20}, [%[pack]]!\n" + "vst1.f32 {d5}, [%[pack]]!\n" + "vst1.f32 {d13}, [%[pack]]!\n" + "vst1.f32 {d21}, [%[pack]]!\n" + + "vzip.32 q1, q3\n" + "vzip.32 q5, q7\n" + "vzip.32 q9, q11\n" + + "vst1.f32 {d2}, [%[pack]]!\n" + "vst1.f32 {d10}, [%[pack]]!\n" + "vst1.f32 {d18}, [%[pack]]!\n" + "vst1.f32 {d3}, [%[pack]]!\n" + "vst1.f32 {d11}, [%[pack]]!\n" + "vst1.f32 {d19}, [%[pack]]!\n" + "vst1.f32 {d6}, [%[pack]]!\n" + "vst1.f32 {d14}, [%[pack]]!\n" + "vst1.f32 {d22}, [%[pack]]!\n" + "vst1.f32 {d7}, [%[pack]]!\n" + "vst1.f32 {d15}, [%[pack]]!\n" + "vst1.f32 {d23}, [%[pack]]!\n" + : [pack] "+r"(in_pack_c8hw6), [in_0] "+r"(in_0), [in_1] "+r"(in_1), + [in_2] "+r"(in_2), [in_3] "+r"(in_3), [in_4] "+r"(in_4), [in_5] "+r"(in_5) + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11"); } } } diff --git a/compute/tensor/src/cpu/arm/fp32/v7/convolution_gemm_icnchw_V7.cpp b/compute/tensor/src/cpu/arm/fp32/v7/convolution_gemm_icnchw_V7.cpp index 2bf21d2f..c35ddb1a 100644 --- a/compute/tensor/src/cpu/arm/fp32/v7/convolution_gemm_icnchw_V7.cpp +++ b/compute/tensor/src/cpu/arm/fp32/v7/convolution_gemm_icnchw_V7.cpp @@ -42,7 +42,7 @@ EE convolution_gemm_icnchw_V7(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); it = ft = ot = 1; p.dilatedRate_t = p.stride_t = 1; - p.padding_before = p.padding_after = 0; + p.pad_before = p.pad_after = 0; } else if 
(tensorIs5d(inputDesc)) { CHECK_STATUS(tensor5dGet(inputDesc, &idt, &idf, &in, &ic, &it, &ih, &iw)); CHECK_STATUS(tensor5dGet(filterDesc, &fdt, &fdf, &fn, &fc, &ft, &fh, &fw)); @@ -66,9 +66,9 @@ EE convolution_gemm_icnchw_V7(TensorDesc inputDesc, return NOT_SUPPORTED; } oc /= 8; - U32 it_pad = it + p.padding_before + p.padding_after; - U32 ih_pad = ih + p.padding_top + p.padding_bottom; - U32 iw_pad = iw + p.padding_left + p.padding_right; + U32 it_pad = it + p.pad_before + p.pad_after; + U32 ih_pad = ih + p.pad_top + p.pad_bottom; + U32 iw_pad = iw + p.pad_left + p.pad_right; I64 K = ic * ft * fh * fw; I32 ohow = ot * oh * ow; F32 *in_pack = ((F32 *)tmp) + ic * it_pad * ih_pad * iw_pad; diff --git a/compute/tensor/src/cpu/arm/fp32/v7/depthwise_pointwise_convolution_direct_V7.cpp b/compute/tensor/src/cpu/arm/fp32/v7/depthwise_pointwise_convolution_direct_V7.cpp index 60e19d3b..1e2aa034 100644 --- a/compute/tensor/src/cpu/arm/fp32/v7/depthwise_pointwise_convolution_direct_V7.cpp +++ b/compute/tensor/src/cpu/arm/fp32/v7/depthwise_pointwise_convolution_direct_V7.cpp @@ -45,10 +45,10 @@ EE depthwise_pointwise_convolution_direct_V7(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); U32 strideH = convParamSpec.stride_h; U32 strideW = convParamSpec.stride_w; - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; U32 dilateH = convParamSpec.dilatedRate_h; U32 dilateW = convParamSpec.dilatedRate_w; @@ -75,20 +75,20 @@ EE depthwise_pointwise_convolution_direct_V7(TensorDesc inputDesc, F32 *inArray_mov = inArray + n * ic * ihiw * 8; for (U32 c = 0; c < ic; c++) { if (paddingT > 0) { - memset(inArray_pad_mov, 0, paddingT * iw_pad * 8 * bytesOf(fdt)); + UNI_MEMSET(inArray_pad_mov, 0, paddingT * iw_pad * 8 * bytesOf(fdt)); inArray_pad_mov += paddingT * iw_pad * 8; } for (U32 h = paddingT; h < ih_pad - paddingB; h++) { - memset(inArray_pad_mov, 0, paddingL * 8 * bytesOf(fdt)); + UNI_MEMSET(inArray_pad_mov, 0, paddingL * 8 * bytesOf(fdt)); inArray_pad_mov += paddingL * 8; - memcpy(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(fdt)); + UNI_MEMCPY(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(fdt)); inArray_pad_mov += iw * 8; inArray_mov += iw * 8; - memset(inArray_pad_mov, 0, paddingR * 8 * bytesOf(fdt)); + UNI_MEMSET(inArray_pad_mov, 0, paddingR * 8 * bytesOf(fdt)); inArray_pad_mov += paddingR * 8; } if (paddingB > 0) { - memset(inArray_pad_mov, 0, paddingB * iw_pad * 8 * bytesOf(fdt)); + UNI_MEMSET(inArray_pad_mov, 0, paddingB * iw_pad * 8 * bytesOf(fdt)); inArray_pad_mov += paddingB * iw_pad * 8; } diff --git a/compute/tensor/src/cpu/arm/fp32/v8/convolution_gemm_V8.cpp b/compute/tensor/src/cpu/arm/fp32/v8/convolution_gemm_V8.cpp index 846b844c..33391996 100644 --- a/compute/tensor/src/cpu/arm/fp32/v8/convolution_gemm_V8.cpp +++ b/compute/tensor/src/cpu/arm/fp32/v8/convolution_gemm_V8.cpp @@ -42,7 +42,7 @@ EE convolution_gemm_V8(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); it = ft = ot = 1; p.dilatedRate_t = p.stride_t = 1; - p.padding_before = p.padding_after = 0; + p.pad_before = p.pad_after = 0; } else if (tensorIs5d(inputDesc)) { CHECK_STATUS(tensor5dGet(inputDesc, &idt, &idf, &in, &ic, &it, &ih, &iw)); 
CHECK_STATUS(tensor5dGet(filterDesc, &fdt, &fdf, &fn, &fc, &ft, &fh, &fw)); @@ -56,9 +56,9 @@ EE convolution_gemm_V8(TensorDesc inputDesc, } oc /= 8; - U32 it_pad = it + p.padding_before + p.padding_after; - U32 ih_pad = ih + p.padding_top + p.padding_bottom; - U32 iw_pad = iw + p.padding_left + p.padding_right; + U32 it_pad = it + p.pad_before + p.pad_after; + U32 ih_pad = ih + p.pad_top + p.pad_bottom; + U32 iw_pad = iw + p.pad_left + p.pad_right; I64 K = ic * ft * fh * fw; I32 ohow = ot * oh * ow; F32 *in_pack = ((F32 *)tmp) + ic * it_pad * ih_pad * iw_pad; @@ -216,9 +216,8 @@ EE convolution_gemm_V8(TensorDesc inputDesc, : : [pack] "r"(in_pack_c8hw12), [in_0] "r"(in_0), [in_1] "r"(in_1), [in_2] "r"(in_2), [in_3] "r"(in_3), [in_4] "r"(in_4), - [in_5] "r"(in_5), [in_6] "r"(in_6), [in_7] "r"(in_7), - [in_8] "r"(in_8), [in_9] "r"(in_9), [in_10] "r"(in_10), - [in_11] "r"(in_11) + [in_5] "r"(in_5), [in_6] "r"(in_6), [in_7] "r"(in_7), [in_8] "r"(in_8), + [in_9] "r"(in_9), [in_10] "r"(in_10), [in_11] "r"(in_11) : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", @@ -756,8 +755,7 @@ EE convolution_gemm_V8(TensorDesc inputDesc, "st4 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[pack]], #64\n" "st4 {v4.4s, v5.4s, v6.4s, v7.4s}, [%[pack]]\n" : [pack] "+r"(in_pack_c8hw4) - : [in_0] "r"(in_0), [in_1] "r"(in_1), [in_2] "r"(in_2), - [in_3] "r"(in_3) + : [in_0] "r"(in_0), [in_1] "r"(in_1), [in_2] "r"(in_2), [in_3] "r"(in_3) : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); } } diff --git a/compute/tensor/src/cpu/arm/fp32/v8/convolution_gemm_icnchw_V8.cpp b/compute/tensor/src/cpu/arm/fp32/v8/convolution_gemm_icnchw_V8.cpp index a6b72066..5f568202 100644 --- a/compute/tensor/src/cpu/arm/fp32/v8/convolution_gemm_icnchw_V8.cpp +++ b/compute/tensor/src/cpu/arm/fp32/v8/convolution_gemm_icnchw_V8.cpp @@ -42,7 +42,7 @@ EE convolution_gemm_icnchw_V8(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); it = ft = ot = 1; p.dilatedRate_t = p.stride_t = 1; - p.padding_before = p.padding_after = 0; + p.pad_before = p.pad_after = 0; } else if (tensorIs5d(inputDesc)) { CHECK_STATUS(tensor5dGet(inputDesc, &idt, &idf, &in, &ic, &it, &ih, &iw)); CHECK_STATUS(tensor5dGet(filterDesc, &fdt, &fdf, &fn, &fc, &ft, &fh, &fw)); @@ -55,9 +55,9 @@ EE convolution_gemm_icnchw_V8(TensorDesc inputDesc, } oc /= 8; - U32 it_pad = it + p.padding_before + p.padding_after; - U32 ih_pad = ih + p.padding_top + p.padding_bottom; - U32 iw_pad = iw + p.padding_left + p.padding_right; + U32 it_pad = it + p.pad_before + p.pad_after; + U32 ih_pad = ih + p.pad_top + p.pad_bottom; + U32 iw_pad = iw + p.pad_left + p.pad_right; I64 K = ic * ft * fh * fw; I32 ohow = ot * oh * ow; F32 *in_pack = ((F32 *)tmp) + ic * it_pad * ih_pad * iw_pad; diff --git a/compute/tensor/src/cpu/arm/fp32/v8/convolution_winograd_V8.cpp b/compute/tensor/src/cpu/arm/fp32/v8/convolution_winograd_V8.cpp index fe479f40..b2e375c4 100644 --- a/compute/tensor/src/cpu/arm/fp32/v8/convolution_winograd_V8.cpp +++ b/compute/tensor/src/cpu/arm/fp32/v8/convolution_winograd_V8.cpp @@ -40,10 +40,10 @@ EE convolution_winograd_V8(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - U32 paddingT = convParamSpec.padding_top; - 
U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; if (fdf != DF_HWNCN8) { CHECK_STATUS(NOT_MATCH); @@ -78,8 +78,8 @@ EE convolution_winograd_V8(TensorDesc inputDesc, EE ret = SUCCESS; // copy input into a input with padding for (U32 n = 0; n < in; n++) { - convParamSpec.padding_bottom = pad_bottom; - convParamSpec.padding_right = pad_right; + convParamSpec.pad_bottom = pad_bottom; + convParamSpec.pad_right = pad_right; F32 *inArray_pad = convolution_input_padding_per_channel( n, ic, 1, ih, iw, convParamSpec, inArray, (F32 *)tmp); @@ -796,7 +796,7 @@ EE convolution_winograd_V8(TensorDesc inputDesc, trans_I_4x4_3x3(Iw_ptr1, I1); for (U32 i = 0; i < 36; i++) { F32 *itm = itmArray_mov + i * ic * 8; - memcpy(itm, Iw[i], 8 * bytesOf(idt)); + UNI_MEMCPY(itm, Iw[i], 8 * bytesOf(idt)); } } for (I32 o = 0; o < I32(oc); o++) { diff --git a/compute/tensor/src/cpu/arm/fp32/v8/depthwise_pointwise_convolution_direct_V8.cpp b/compute/tensor/src/cpu/arm/fp32/v8/depthwise_pointwise_convolution_direct_V8.cpp index 6e0903ae..8b6ecf06 100644 --- a/compute/tensor/src/cpu/arm/fp32/v8/depthwise_pointwise_convolution_direct_V8.cpp +++ b/compute/tensor/src/cpu/arm/fp32/v8/depthwise_pointwise_convolution_direct_V8.cpp @@ -43,10 +43,10 @@ EE depthwise_pointwise_convolution_direct_V8(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); U32 strideH = convParamSpec.stride_h; U32 strideW = convParamSpec.stride_w; - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; U32 dilateH = convParamSpec.dilatedRate_h; U32 dilateW = convParamSpec.dilatedRate_w; @@ -70,20 +70,20 @@ EE depthwise_pointwise_convolution_direct_V8(TensorDesc inputDesc, F32 *inArray_mov = inArray + n * ic * ihiw * 8; for (U32 c = 0; c < ic; c++) { if (paddingT > 0) { - memset(inArray_pad_mov, 0, paddingT * iw_pad * 8 * bytesOf(fdt)); + UNI_MEMSET(inArray_pad_mov, 0, paddingT * iw_pad * 8 * bytesOf(fdt)); inArray_pad_mov += paddingT * iw_pad * 8; } for (U32 h = paddingT; h < ih_pad - paddingB; h++) { - memset(inArray_pad_mov, 0, paddingL * 8 * bytesOf(fdt)); + UNI_MEMSET(inArray_pad_mov, 0, paddingL * 8 * bytesOf(fdt)); inArray_pad_mov += paddingL * 8; - memcpy(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(fdt)); + UNI_MEMCPY(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(fdt)); inArray_pad_mov += iw * 8; inArray_mov += iw * 8; - memset(inArray_pad_mov, 0, paddingR * 8 * bytesOf(fdt)); + UNI_MEMSET(inArray_pad_mov, 0, paddingR * 8 * bytesOf(fdt)); inArray_pad_mov += paddingR * 8; } if (paddingB > 0) { - memset(inArray_pad_mov, 0, paddingB * iw_pad * 8 * bytesOf(fdt)); + UNI_MEMSET(inArray_pad_mov, 0, paddingB * iw_pad * 8 * bytesOf(fdt)); inArray_pad_mov += paddingB * iw_pad * 8; } @@ -143,41 +143,40 @@ EE depthwise_pointwise_convolution_direct_V8(TensorDesc inputDesc, F32 *in_5 = in_idx + in_h_5 * iw_pad * 8 + in_w_5 * 8; F32 *in_6 = in_idx + in_h_6 * iw_pad * 8 + in_w_6 * 8; F32 *in_7 = in_idx + in_h_7 * iw_pad * 8 + in_w_7 * 8; - __asm__ __volatile__("ldp q16, q17, [%[f0]]\n" - 
"ldp q30, q31, [%[in0]]\n" - "ldp q18, q19, [%[in1]]\n" - "ldp q20, q21, [%[in2]]\n" - "ldp q22, q23, [%[in3]]\n" - "ldp q24, q25, [%[in4]]\n" - "ldp q26, q27, [%[in5]]\n" - "ldp q28, q29, [%[in6]]\n" - - "fmla v0.4s, v30.4s, v16.4s\n" - "fmla v1.4s, v31.4s, v17.4s\n" - "fmla v2.4s, v18.4s, v16.4s\n" - "ldp q30, q31, [%[in7]]\n" - "fmla v3.4s, v19.4s, v17.4s\n" - "fmla v4.4s, v20.4s, v16.4s\n" - "fmla v5.4s, v21.4s, v17.4s\n" - "fmla v6.4s, v22.4s, v16.4s\n" - "fmla v7.4s, v23.4s, v17.4s\n" - "fmla v8.4s, v24.4s, v16.4s\n" - "fmla v9.4s, v25.4s, v17.4s\n" - "fmla v10.4s, v26.4s, v16.4s\n" - "fmla v11.4s, v27.4s, v17.4s\n" - "fmla v12.4s, v28.4s, v16.4s\n" - "fmla v13.4s, v29.4s, v17.4s\n" - "fmla v14.4s, v30.4s, v16.4s\n" - "fmla v15.4s, v31.4s, v17.4s\n" - : - : [in0] "r"(in_0), [in1] "r"(in_1), [in2] "r"(in_2), - [in3] "r"(in_3), [in4] "r"(in_4), [in5] "r"(in_5), - [in6] "r"(in_6), [in7] "r"(in_7), [f0] "r"(f_0) - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", - "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", - "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", - "v30", "v31"); + __asm__ __volatile__( + "ldp q16, q17, [%[f0]]\n" + "ldp q30, q31, [%[in0]]\n" + "ldp q18, q19, [%[in1]]\n" + "ldp q20, q21, [%[in2]]\n" + "ldp q22, q23, [%[in3]]\n" + "ldp q24, q25, [%[in4]]\n" + "ldp q26, q27, [%[in5]]\n" + "ldp q28, q29, [%[in6]]\n" + + "fmla v0.4s, v30.4s, v16.4s\n" + "fmla v1.4s, v31.4s, v17.4s\n" + "fmla v2.4s, v18.4s, v16.4s\n" + "ldp q30, q31, [%[in7]]\n" + "fmla v3.4s, v19.4s, v17.4s\n" + "fmla v4.4s, v20.4s, v16.4s\n" + "fmla v5.4s, v21.4s, v17.4s\n" + "fmla v6.4s, v22.4s, v16.4s\n" + "fmla v7.4s, v23.4s, v17.4s\n" + "fmla v8.4s, v24.4s, v16.4s\n" + "fmla v9.4s, v25.4s, v17.4s\n" + "fmla v10.4s, v26.4s, v16.4s\n" + "fmla v11.4s, v27.4s, v17.4s\n" + "fmla v12.4s, v28.4s, v16.4s\n" + "fmla v13.4s, v29.4s, v17.4s\n" + "fmla v14.4s, v30.4s, v16.4s\n" + "fmla v15.4s, v31.4s, v17.4s\n" + : + : [in0] "r"(in_0), [in1] "r"(in_1), [in2] "r"(in_2), [in3] "r"(in_3), + [in4] "r"(in_4), [in5] "r"(in_5), [in6] "r"(in_6), [in7] "r"(in_7), [f0] "r"(f_0) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", + "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", + "v29", "v30", "v31"); } } diff --git a/compute/tensor/src/cpu/arm/int32/scale.cpp b/compute/tensor/src/cpu/arm/int32/scale.cpp new file mode 100644 index 00000000..3a8bacbd --- /dev/null +++ b/compute/tensor/src/cpu/arm/int32/scale.cpp @@ -0,0 +1,157 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include +#include "cpu/arm/int32/tensor_computing_int32.h" + +EE scale_nchwc8_int32( + I32 *input, I32 *alpha, I32 *beta, I32 in, I32 ic, I32 elements_per_channel, I32 *output) +{ + int32x4_t in_vec, out_vec; + int32x4_t one = vdupq_n_s32(1); + int32x4_t zero = vdupq_n_s32(0); + U32 index = 0; + for (I32 n = 0; n < in; n++) { + for (I32 c = 0; c < ic; c += 8) { + int32x4_t alpha_vec0 = (alpha == nullptr) ? one : vld1q_s32(alpha + c); + int32x4_t alpha_vec1 = (alpha == nullptr) ? one : vld1q_s32(alpha + c + 4); + int32x4_t beta_vec0 = (beta == nullptr) ? zero : vld1q_s32(beta + c); + int32x4_t beta_vec1 = (beta == nullptr) ? zero : vld1q_s32(beta + c + 4); + for (I32 i = 0; i < elements_per_channel; i++) { + in_vec = vld1q_s32(input + index); + out_vec = vmlaq_s32(beta_vec0, alpha_vec0, in_vec); + vst1q_s32(output + index, out_vec); + + in_vec = vld1q_s32(input + index + 4); + out_vec = vmlaq_s32(beta_vec1, alpha_vec1, in_vec); + vst1q_s32(output + index + 4, out_vec); + index += 8; + } + } + } + return SUCCESS; +} + +template <bool icoc_equal> +EE scale_nchw_int32( + I32 *input, I32 *alpha, I32 *beta, I32 in, I32 ic, I32 elements_per_channel, I32 *output) +{ + int32x4_t one = vdupq_n_s32(1); + int32x4_t zero = vdupq_n_s32(0); + U32 dst = 0, src = 0; + for (I32 n = 0; n < in; n++) { + for (I32 c = 0; c < ic; c++) { + int32x4_t alpha_vec = (alpha == nullptr) ? one : vdupq_n_s32(alpha[c]); + int32x4_t beta_vec = (beta == nullptr) ? zero : vdupq_n_s32(beta[c]); + I32 i = 0; + for (; i < elements_per_channel - 3; i += 4) { + if (icoc_equal) { + src = (n * ic + c) * elements_per_channel + i; + } else { + src = n * elements_per_channel + i; + } + int32x4_t in_vec = vld1q_s32(input + src); + int32x4_t out_vec = vmlaq_s32(beta_vec, alpha_vec, in_vec); + vst1q_s32(output + dst, out_vec); + dst += 4; + } + for (; i < elements_per_channel; i++) { + if (icoc_equal) { + src = (n * ic + c) * elements_per_channel + i; + } else { + src = n * elements_per_channel + i; + } + int alpha_s = (alpha == nullptr) ? 1 : alpha[c]; + int beta_s = (beta == nullptr) ? 0 : beta[c]; + output[dst] = alpha_s * input[src] + beta_s; + dst++; + } + } + } + return SUCCESS; +} + +template <bool icoc_equal> +EE scale_nhwc_int32( + I32 *input, I32 *alpha, I32 *beta, I32 in, I32 ic, I32 elements_per_channel, I32 *output) +{ + int32x4_t one = vdupq_n_s32(1); + int32x4_t zero = vdupq_n_s32(0); + int32x4_t in_vec; + int in_s; + for (I32 n = 0, src = 0, dst = 0; n < in; n++) { + for (I32 i = 0; i < elements_per_channel; i++, src++) { + I32 c = 0; + for (; c < ic - 3; c += 4) { + int32x4_t alpha_vec = (alpha == nullptr) ? one : vld1q_s32(alpha + c); + int32x4_t beta_vec = (beta == nullptr) ? zero : vld1q_s32(beta + c); + if (icoc_equal) { + in_vec = vld1q_s32(input + dst); + } else { + in_vec = vdupq_n_s32(input[src]); + } + int32x4_t out_vec = vmlaq_s32(beta_vec, alpha_vec, in_vec); + vst1q_s32(output + dst, out_vec); + dst += 4; + } + for (; c < ic; c++) { + int alpha_s = (alpha == nullptr) ? 1 : alpha[c]; + int beta_s = (beta == nullptr) ? 0 : beta[c]; + if (icoc_equal) { + in_s = input[dst]; + } else { + in_s = input[src]; + } + output[dst] = alpha_s * in_s + beta_s; + dst++; + } + } + } + return SUCCESS; +} + +EE scale_int32(I32 *input, + I32 axis, + I32 nDims, + I32 *alpha, + I32 *beta, + I32 on, + I32 oc, + I32 elements_per_channel, + I32 ic, + I32 *output) +{ + if (nullptr == input || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + EE ret = SUCCESS; + // If oc is 1, it means that weights/vectors have only one param, so we need to use the calculation logic of nchw. + if (axis == 1 || axis == 0 || oc == 1) { + if (ic == oc) { + ret = scale_nchw_int32<true>(input, alpha, beta, on, oc, elements_per_channel, output); + } else { + ret = scale_nchw_int32<false>(input, alpha, beta, on, oc, elements_per_channel, output); + } + } else if (axis == nDims - 1) { + if (ic == oc) { + ret = scale_nhwc_int32<true>(input, alpha, beta, on, oc, elements_per_channel, output); + } else { + ret = scale_nhwc_int32<false>(input, alpha, beta, on, oc, elements_per_channel, output); + } + } else if (axis == nDims) { + ret = scale_nchwc8_int32(input, alpha, beta, on, oc, elements_per_channel, output); + } else { + CHECK_STATUS(NOT_SUPPORTED); + } + return ret; +}
diff --git a/compute/tensor/src/cpu/arm/int32/tensor_computing_int32.h b/compute/tensor/src/cpu/arm/int32/tensor_computing_int32.h new file mode 100644 index 00000000..98ae11b2 --- /dev/null +++ b/compute/tensor/src/cpu/arm/int32/tensor_computing_int32.h @@ -0,0 +1,31 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ +#ifndef _TENSOR_COMPUTING_INT32_H +#define _TENSOR_COMPUTING_INT32_H + +#include "error.h" + +#include "thread_affinity.h" + +EE scale_int32(I32 *input, + I32 axis, + I32 nDims, + I32 *alpha, + I32 *beta, + I32 on, + I32 oc, + I32 elements_per_channel, + I32 ic, + I32 *output); +#endif diff --git a/compute/tensor/src/cpu/arm/int8/arm_functions_int8.h b/compute/tensor/src/cpu/arm/int8/arm_functions_int8.h index 1b91961a..fb42e2ee 100644 --- a/compute/tensor/src/cpu/arm/int8/arm_functions_int8.h +++ b/compute/tensor/src/cpu/arm/int8/arm_functions_int8.h @@ -14,40 +14,43 @@ #ifndef _H_ARM_FUNCTIONS_INT8 #define _H_ARM_FUNCTIONS_INT8 +#include "cpu/cpu_functions_template.h" #include "arm_neon_expand.h" -#include "parameter_spec.h" inline EE activation_int8(INT8 *input, U32 len, ActivationParamSpec activationDesc, INT8 *output) { - int8x16_t in, out; int8x16_t zero = vdupq_n_s8(0); - U32 len_main = len / 16; - U32 len_tail = len % 16; - + U32 loops = len / 16 * 16; + EE ret = SUCCESS; switch (activationDesc.mode) { case ACTIVATION_NULL: { + if (output != input) { + UNI_MEMCPY(output, input, sizeof(INT8) * len); + } + loops = len; break; } case ACTIVATION_RELU: { if (activationDesc.value[0] != 0) { - return NOT_SUPPORTED; - } - for (U32 i = 0; i < len_main; i++) { - in = vld1q_s8(input); - out = vmaxq_s8(zero, in); - vst1q_s8(output, out); - input += 16; - output += 16; - } - for (U32 i = 0; i < len_tail; i++) { - output[i] = (input[i] < 0) ? 0 : input[i]; + ret = NOT_SUPPORTED; + } else { + for (U32 i = 0; i < loops; i += 16) { + int8x16_t in = vld1q_s8(input + i); + int8x16_t out = vmaxq_s8(zero, in); + vst1q_s8(output + i, out); + } } break; } default: - return NOT_SUPPORTED; + ret = NOT_SUPPORTED; + break; } - - return SUCCESS; + if (ret == SUCCESS) { + for (U32 i = loops; i < len; i++) { + ret = activation_template(activationDesc, input[i], output + i); + } + } + return ret; } #endif diff --git a/compute/tensor/src/cpu/arm/int8/concat.cpp b/compute/tensor/src/cpu/arm/int8/concat.cpp index 9281e180..3bcf72a5 100644 --- a/compute/tensor/src/cpu/arm/int8/concat.cpp +++ b/compute/tensor/src/cpu/arm/int8/concat.cpp @@ -26,7 +26,7 @@ EE concat_int8(std::vector inputDesc, CHECK_STATUS(NOT_MATCH); } if (inputDesc.size() == 1) { - memcpy(output, input[0], tensorNumBytes(outputDesc)); + UNI_MEMCPY(output, input[0], tensorNumBytes(outputDesc)); return SUCCESS; } if (concatDim != 0 && concatDim != 1) { @@ -113,7 +113,7 @@ EE concat_int8(std::vector inputDesc, for (U32 i = 0; i < inputDesc.size(); i++) { copySize = tensorNumElements(inputDesc[i]) * sizeof(INT8); - memcpy(out_ptr, input[i], copySize); + UNI_MEMCPY(out_ptr, input[i], copySize); out_ptr = out_ptr + copySize; } return SUCCESS; @@ -129,7 +129,7 @@ EE concat_int8(std::vector inputDesc, copySize = tensorNumElements(inputDesc[i]) / in * sizeof(INT8); - memcpy(out_ptr, (INT8 *)input[i] + j * copySize, copySize); + UNI_MEMCPY(out_ptr, (INT8 *)input[i] + j * copySize, copySize); out_ptr = out_ptr + copySize; } } diff --git a/compute/tensor/src/cpu/arm/int8/convolution.cpp b/compute/tensor/src/cpu/arm/int8/convolution.cpp index 10adc050..d614b935 100644 --- a/compute/tensor/src/cpu/arm/int8/convolution.cpp +++ b/compute/tensor/src/cpu/arm/int8/convolution.cpp @@ -12,10 +12,10 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
#include "cpu/arm/int8/tensor_computing_int8.h" -#ifdef __aarch64__ -#include "cpu/arm/int8/v8/convolution_winograd.h" -#include "cpu/arm/int8/v8/convolution_gemm.h" -#else +#if defined(_USE_FP16) +#include "cpu/arm/int8/v8.2/convolution_winograd.h" +#include "cpu/arm/int8/v8.2/convolution_gemm.h" +#elif !defined(__aarch64__) #include "cpu/arm/int8/v7/convolution_gemm.h" #endif #include "tensor_transpose.h" @@ -74,23 +74,25 @@ EE convolution_int8(TensorDesc inputDesc, inputPtr = tmpPtr; tmpPtr += tensorNumBytes(inputDesc); tmpBytes -= tensorNumBytes(inputDesc); - algorithm = CONVOLUTION_ALGORITHM_GEMM; + //algorithm = CONVOLUTION_ALGORITHM_GEMM; } EE ret = SUCCESS; switch (algorithm) { -#ifdef __aarch64__ +#if defined(_USE_FP16) case CONVOLUTION_ALGORITHM_WINOGRAD: ret = convolution_winograd(inputDesc, inputPtr, scales, filterDesc, filter, scales + 2, convParamSpec, biasDesc, bias, tmpBytes, tmpPtr, outputDesc, output, scales + 1, activationDesc, arch); break; #endif +#if defined(_USE_FP16) || !defined(__aarch64__) case CONVOLUTION_ALGORITHM_GEMM: ret = convolution_gemm(inputDesc, inputPtr, scales, filterDesc, filter, scales + 2, convParamSpec, biasDesc, bias, tmpBytes, tmpPtr, outputDesc, output, scales + 1, activationDesc, arch); break; +#endif default: ret = NOT_SUPPORTED; break; diff --git a/compute/tensor/src/cpu/arm/int8/convolution_transform.cpp b/compute/tensor/src/cpu/arm/int8/convolution_transform.cpp index 63c0f446..dfd0d90d 100644 --- a/compute/tensor/src/cpu/arm/int8/convolution_transform.cpp +++ b/compute/tensor/src/cpu/arm/int8/convolution_transform.cpp @@ -32,7 +32,7 @@ inline EE convolution_transform_filter_kernel_int8(TensorDesc filterDesc, CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); if (fdf == ftmDataFormat) { *ftmDesc = filterDesc; - memcpy(ftm, filter, fn * fc * fh * fw * bytesOf(fdt)); + UNI_MEMCPY(ftm, filter, fn * fc * fh * fw * bytesOf(fdt)); return SUCCESS; } if (fdf != DF_NCHW) { diff --git a/compute/tensor/src/cpu/arm/int8/depthwise_pointwise_convolution.cpp b/compute/tensor/src/cpu/arm/int8/depthwise_pointwise_convolution.cpp index 04c23b4a..fb01e18a 100644 --- a/compute/tensor/src/cpu/arm/int8/depthwise_pointwise_convolution.cpp +++ b/compute/tensor/src/cpu/arm/int8/depthwise_pointwise_convolution.cpp @@ -59,12 +59,14 @@ EE depthwise_pointwise_convolution_int8(TensorDesc inputDesc, EE ret = SUCCESS; switch (algorithm) { +#if defined(_USE_FP16) || !defined(__aarch64__) case DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT: ret = depthwise_pointwise_convolution_direct(inputDesc, input, dwFilterDesc, dwFilter, pwFilterDesc, pwFilter, convParamSpec, dwBiasDesc, dwBias, pwBiasDesc, pwBias, tmpBytes, tmp, outputDesc, output, depthwiseActivationParamSpec, pointwiseActivationParamSpec, arch); break; +#endif default: ret = NOT_SUPPORTED; break; diff --git a/compute/tensor/src/cpu/arm/int8/v7/convolution_gemm.cpp b/compute/tensor/src/cpu/arm/int8/v7/convolution_gemm.cpp index 29d5ec37..f004f065 100644 --- a/compute/tensor/src/cpu/arm/int8/v7/convolution_gemm.cpp +++ b/compute/tensor/src/cpu/arm/int8/v7/convolution_gemm.cpp @@ -45,10 +45,10 @@ EE convolution_gemm_v7(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); U32 strideH = p.stride_h; U32 strideW = p.stride_w; - U32 paddingT = p.padding_top; - U32 paddingB = p.padding_bottom; - U32 paddingL = p.padding_left; - U32 paddingR = p.padding_right; + U32 paddingT = p.pad_top; + U32 paddingB = p.pad_bottom; + U32 paddingL = p.pad_left; + U32 paddingR = 
p.pad_right; U32 dilateH = p.dilatedRate_h; U32 dilateW = p.dilatedRate_w; @@ -160,108 +160,108 @@ EE convolution_gemm_v7(TensorDesc inputDesc, I32 *out_buf = biasScaled + oc * 8 + ((n * oc + o) * ohow + hw) * 8; F32 *out_o0hw0 = outArray + ((n * oc + o) * ohow + hw) * 8; #if 1 - asm volatile("cmp %[out_f32], #0\n" - "beq 0f\n" - "vmov.s32 q4, #0.\n" - "vmov.s32 q5, #0.\n" - "vmov.s32 q6, #0.\n" - "vmov.s32 q7, #0.\n" - "vmov.s32 q8, #0.\n" - "vmov.s32 q9, #0.\n" - "vmov.s32 q10, #0.\n" - "vmov.s32 q11, #0.\n" - "b 1f\n" - - "0:\n" - "vld1.s32 {d8-d11}, [%[b0_s]]\n" - "vld1.s32 {d12-d15}, [%[b0_s]]\n" - "vld1.s32 {d16-d19}, [%[b0_s]]\n" - "vld1.s32 {d20-d23}, [%[b0_s]]\n" - - "1:\n" - "vld1.s8 {d0[]}, [%[in]]!\n" - "vld1.s8 {d1[]}, [%[in]]!\n" - "vld1.s8 {d2[]}, [%[in]]!\n" - "vld1.s8 {d3[]}, [%[in]]!\n" - - "vld1.s8 {d4-d5}, [%[w]]!\n" - - // K- > r2 - "mov r2, %[K]\n" - - // Computation loop - "2:\n" - - "vmull.s8 q12, d4, d0\n" - "vld1.s8 {d0[]}, [%[in]]!\n" - "vmull.s8 q13, d4, d1\n" - "vld1.s8 {d1[]}, [%[in]]!\n" - "vmull.s8 q14, d4, d2\n" - "vld1.s8 {d2[]}, [%[in]]!\n" - "vmull.s8 q15, d4, d3\n" - "vld1.s8 {d3[]}, [%[in]]!\n" - "vld1.s8 {d4}, [%[w]]!\n" - - "vmlal.s8 q12, d5, d0\n" - "vmlal.s8 q13, d5, d1\n" - "vld1.s8 {d0[]}, [%[in]]!\n" - "vmlal.s8 q14, d5, d2\n" - "vld1.s8 {d1[]}, [%[in]]!\n" - "vmlal.s8 q15, d5, d3\n" - - //"vaddw.s16 q4, q4, d24\n" - //"vaddw.s16 q5, q5, d25\n" - //"vaddw.s16 q6, q6, d26\n" - //"vaddw.s16 q7, q7, d27\n" - //"vaddw.s16 q8, q8, d28\n" - //"vaddw.s16 q9, q9, d29\n" - //"vaddw.s16 q10, q10, d30\n" - //"vaddw.s16 q11, q11, d31\n" - //"vmov.s32 q12, #0\n" - //"vmov.s32 q13, #0\n" - //"vmov.s32 q14, #0\n" - //"vmov.s32 q15, #0\n" - - "vld1.s8 {d2[]}, [%[in]]!\n" - "vmlal.s8 q12, d4, d0\n" - "vld1.s8 {d3[]}, [%[in]]!\n" - "vld1.s8 {d5}, [%[w]]!\n" - "vmlal.s8 q13, d4, d1\n" - "vld1.s8 {d0[]}, [%[in]]!\n" - "vmlal.s8 q14, d4, d2\n" - "vld1.s8 {d1[]}, [%[in]]!\n" - "vmlal.s8 q15, d4, d3\n" - "vld1.s8 {d2[]}, [%[in]]!\n" - - "vmlal.s8 q12, d5, d0\n" - "vld1.s8 {d3[]}, [%[in]]!\n" - "vld1.s8 {d4}, [%[w]]!\n" - "vmlal.s8 q13, d5, d1\n" - "vld1.s8 {d0[]}, [%[in]]!\n" - "vmlal.s8 q14, d5, d2\n" - "vld1.s8 {d1[]}, [%[in]]!\n" - "vmlal.s8 q15, d5, d3\n" - "vld1.s8 {d2[]}, [%[in]]!\n" - "vld1.s8 {d3[]}, [%[in]]!\n" - "vld1.s8 {d5}, [%[w]]!\n" - - "subs r2, r2, #4\n" - - "vaddw.s16 q4, q4, d24\n" - "vaddw.s16 q5, q5, d25\n" - "vaddw.s16 q6, q6, d26\n" - "vaddw.s16 q7, q7, d27\n" - "vaddw.s16 q8, q8, d28\n" - "vaddw.s16 q9, q9, d29\n" - "vaddw.s16 q10, q10, d30\n" - "vaddw.s16 q11, q11, d31\n" - - "bne 2b\n" - : [in] "+r"(in_hw), [w] "+r"(f_o) - : [K] "r"((I64)(ic * fh * fw * 8)), [b0_s] "r"(b0_s), - [out_f32] "r"(out_f32_bool) - : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", - "q9", "q10", "q11", "q12", "q13", "q14", "q15", "r1", "r2"); + asm volatile( + "cmp %[out_f32], #0\n" + "beq 0f\n" + "vmov.s32 q4, #0.\n" + "vmov.s32 q5, #0.\n" + "vmov.s32 q6, #0.\n" + "vmov.s32 q7, #0.\n" + "vmov.s32 q8, #0.\n" + "vmov.s32 q9, #0.\n" + "vmov.s32 q10, #0.\n" + "vmov.s32 q11, #0.\n" + "b 1f\n" + + "0:\n" + "vld1.s32 {d8-d11}, [%[b0_s]]\n" + "vld1.s32 {d12-d15}, [%[b0_s]]\n" + "vld1.s32 {d16-d19}, [%[b0_s]]\n" + "vld1.s32 {d20-d23}, [%[b0_s]]\n" + + "1:\n" + "vld1.s8 {d0[]}, [%[in]]!\n" + "vld1.s8 {d1[]}, [%[in]]!\n" + "vld1.s8 {d2[]}, [%[in]]!\n" + "vld1.s8 {d3[]}, [%[in]]!\n" + + "vld1.s8 {d4-d5}, [%[w]]!\n" + + // K- > r2 + "mov r2, %[K]\n" + + // Computation loop + "2:\n" + + "vmull.s8 q12, d4, d0\n" + "vld1.s8 {d0[]}, [%[in]]!\n" + "vmull.s8 q13, 
d4, d1\n" + "vld1.s8 {d1[]}, [%[in]]!\n" + "vmull.s8 q14, d4, d2\n" + "vld1.s8 {d2[]}, [%[in]]!\n" + "vmull.s8 q15, d4, d3\n" + "vld1.s8 {d3[]}, [%[in]]!\n" + "vld1.s8 {d4}, [%[w]]!\n" + + "vmlal.s8 q12, d5, d0\n" + "vmlal.s8 q13, d5, d1\n" + "vld1.s8 {d0[]}, [%[in]]!\n" + "vmlal.s8 q14, d5, d2\n" + "vld1.s8 {d1[]}, [%[in]]!\n" + "vmlal.s8 q15, d5, d3\n" + + //"vaddw.s16 q4, q4, d24\n" + //"vaddw.s16 q5, q5, d25\n" + //"vaddw.s16 q6, q6, d26\n" + //"vaddw.s16 q7, q7, d27\n" + //"vaddw.s16 q8, q8, d28\n" + //"vaddw.s16 q9, q9, d29\n" + //"vaddw.s16 q10, q10, d30\n" + //"vaddw.s16 q11, q11, d31\n" + //"vmov.s32 q12, #0\n" + //"vmov.s32 q13, #0\n" + //"vmov.s32 q14, #0\n" + //"vmov.s32 q15, #0\n" + + "vld1.s8 {d2[]}, [%[in]]!\n" + "vmlal.s8 q12, d4, d0\n" + "vld1.s8 {d3[]}, [%[in]]!\n" + "vld1.s8 {d5}, [%[w]]!\n" + "vmlal.s8 q13, d4, d1\n" + "vld1.s8 {d0[]}, [%[in]]!\n" + "vmlal.s8 q14, d4, d2\n" + "vld1.s8 {d1[]}, [%[in]]!\n" + "vmlal.s8 q15, d4, d3\n" + "vld1.s8 {d2[]}, [%[in]]!\n" + + "vmlal.s8 q12, d5, d0\n" + "vld1.s8 {d3[]}, [%[in]]!\n" + "vld1.s8 {d4}, [%[w]]!\n" + "vmlal.s8 q13, d5, d1\n" + "vld1.s8 {d0[]}, [%[in]]!\n" + "vmlal.s8 q14, d5, d2\n" + "vld1.s8 {d1[]}, [%[in]]!\n" + "vmlal.s8 q15, d5, d3\n" + "vld1.s8 {d2[]}, [%[in]]!\n" + "vld1.s8 {d3[]}, [%[in]]!\n" + "vld1.s8 {d5}, [%[w]]!\n" + + "subs r2, r2, #4\n" + + "vaddw.s16 q4, q4, d24\n" + "vaddw.s16 q5, q5, d25\n" + "vaddw.s16 q6, q6, d26\n" + "vaddw.s16 q7, q7, d27\n" + "vaddw.s16 q8, q8, d28\n" + "vaddw.s16 q9, q9, d29\n" + "vaddw.s16 q10, q10, d30\n" + "vaddw.s16 q11, q11, d31\n" + + "bne 2b\n" + : [in] "+r"(in_hw), [w] "+r"(f_o) + : [K] "r"((I64)(ic * fh * fw * 8)), [b0_s] "r"(b0_s), [out_f32] "r"(out_f32_bool) + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15", "r1", "r2"); asm volatile("cmp %[out_f32], #0\n" "beq 4f\n" @@ -317,79 +317,79 @@ EE convolution_gemm_v7(TensorDesc inputDesc, : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "r1"); - asm volatile("cmp %[out_f32], #0\n" - "bne 8f\n" - - "4:\n" - "cmp %[conv_relu], #0\n" - "beq 5f\n" - "vmov.s32 q0, #0\n" - "vmaxq.s32 q4, q4, q0\n" - "vmaxq.s32 q5, q5, q0\n" - "vmaxq.s32 q6, q6, q0\n" - "vmaxq.s32 q7, q7, q0\n" - "vmaxq.s32 q8, q8, q0\n" - "vmaxq.s32 q9, q9, q0\n" - "vmaxq.s32 q10, q10, q0\n" - "vmaxq.s32 q11, q11, q0\n" - - "5:\n" - "vld1.s32 {d0-d1}, [%[max_i32]]\n" - "vld1.s32 {d2-d3}, [%[min_i32]]\n" - "cmp %[scale_known], #0\n" - "beq 6f\n" - "vmaxq.s32 q4, q4, q1\n" - "vmaxq.s32 q5, q5, q1\n" - "vmaxq.s32 q6, q6, q1\n" - "vmaxq.s32 q7, q7, q1\n" - "vmaxq.s32 q8, q8, q1\n" - "vmaxq.s32 q9, q9, q1\n" - "vmaxq.s32 q10, q10, q1\n" - "vmaxq.s32 q11, q11, q1\n" - "vminq.s32 q4, q4, q0\n" - "vminq.s32 q5, q5, q0\n" - "vminq.s32 q6, q6, q0\n" - "vminq.s32 q7, q7, q0\n" - "vminq.s32 q8, q8, q0\n" - "vminq.s32 q9, q9, q0\n" - "vminq.s32 q10, q10, q0\n" - "vminq.s32 q11, q11, q0\n" - "b 7f\n" - - "6:\n" - "vmaxq.s32 q0, q4, q0\n" - "vmaxq.s32 q0, q5, q0\n" - "vmaxq.s32 q0, q6, q0\n" - "vmaxq.s32 q0, q7, q0\n" - "vmaxq.s32 q0, q8, q0\n" - "vmaxq.s32 q0, q9, q0\n" - "vmaxq.s32 q0, q10, q0\n" - "vmaxq.s32 q0, q11, q0\n" - "vminq.s32 q1, q4, q1\n" - "vminq.s32 q1, q5, q1\n" - "vminq.s32 q1, q6, q1\n" - "vminq.s32 q1, q7, q1\n" - "vminq.s32 q1, q8, q1\n" - "vminq.s32 q1, q9, q1\n" - "vminq.s32 q1, q10, q1\n" - "vminq.s32 q1, q11, q1\n" - "vst1.s32 {d0-d1}, [%[max_i32]]\n" - "vst1.s32 {d2-d3}, [%[min_i32]]\n" - - "7:\n" - "mov r1, 
%[out_buf]\n" - "vst1.s32 {d8-d11}, [r1]!\n" - "vst1.s32 {d12-d15}, [r1]!\n" - "vst1.s32 {d16-d19}, [r1]!\n" - "vst1.s32 {d20-d23}, [r1]\n" - - "8:\n" - : [out_buf] "+r"(out_buf) - : [max_i32] "r"(max_i32), [min_i32] "r"(min_i32), - [conv_relu] "r"(conv_relu_bool), [out_f32] "r"(out_f32_bool), - [scale_known] "r"(scale_known_bool) - : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", - "q9", "q10", "q11", "q12", "q13", "q14", "q15", "r1"); + asm volatile( + "cmp %[out_f32], #0\n" + "bne 8f\n" + + "4:\n" + "cmp %[conv_relu], #0\n" + "beq 5f\n" + "vmov.s32 q0, #0\n" + "vmaxq.s32 q4, q4, q0\n" + "vmaxq.s32 q5, q5, q0\n" + "vmaxq.s32 q6, q6, q0\n" + "vmaxq.s32 q7, q7, q0\n" + "vmaxq.s32 q8, q8, q0\n" + "vmaxq.s32 q9, q9, q0\n" + "vmaxq.s32 q10, q10, q0\n" + "vmaxq.s32 q11, q11, q0\n" + + "5:\n" + "vld1.s32 {d0-d1}, [%[max_i32]]\n" + "vld1.s32 {d2-d3}, [%[min_i32]]\n" + "cmp %[scale_known], #0\n" + "beq 6f\n" + "vmaxq.s32 q4, q4, q1\n" + "vmaxq.s32 q5, q5, q1\n" + "vmaxq.s32 q6, q6, q1\n" + "vmaxq.s32 q7, q7, q1\n" + "vmaxq.s32 q8, q8, q1\n" + "vmaxq.s32 q9, q9, q1\n" + "vmaxq.s32 q10, q10, q1\n" + "vmaxq.s32 q11, q11, q1\n" + "vminq.s32 q4, q4, q0\n" + "vminq.s32 q5, q5, q0\n" + "vminq.s32 q6, q6, q0\n" + "vminq.s32 q7, q7, q0\n" + "vminq.s32 q8, q8, q0\n" + "vminq.s32 q9, q9, q0\n" + "vminq.s32 q10, q10, q0\n" + "vminq.s32 q11, q11, q0\n" + "b 7f\n" + + "6:\n" + "vmaxq.s32 q0, q4, q0\n" + "vmaxq.s32 q0, q5, q0\n" + "vmaxq.s32 q0, q6, q0\n" + "vmaxq.s32 q0, q7, q0\n" + "vmaxq.s32 q0, q8, q0\n" + "vmaxq.s32 q0, q9, q0\n" + "vmaxq.s32 q0, q10, q0\n" + "vmaxq.s32 q0, q11, q0\n" + "vminq.s32 q1, q4, q1\n" + "vminq.s32 q1, q5, q1\n" + "vminq.s32 q1, q6, q1\n" + "vminq.s32 q1, q7, q1\n" + "vminq.s32 q1, q8, q1\n" + "vminq.s32 q1, q9, q1\n" + "vminq.s32 q1, q10, q1\n" + "vminq.s32 q1, q11, q1\n" + "vst1.s32 {d0-d1}, [%[max_i32]]\n" + "vst1.s32 {d2-d3}, [%[min_i32]]\n" + + "7:\n" + "mov r1, %[out_buf]\n" + "vst1.s32 {d8-d11}, [r1]!\n" + "vst1.s32 {d12-d15}, [r1]!\n" + "vst1.s32 {d16-d19}, [r1]!\n" + "vst1.s32 {d20-d23}, [r1]\n" + + "8:\n" + : [out_buf] "+r"(out_buf) + : [max_i32] "r"(max_i32), [min_i32] "r"(min_i32), [conv_relu] "r"(conv_relu_bool), + [out_f32] "r"(out_f32_bool), [scale_known] "r"(scale_known_bool) + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15", "r1"); #else int32x4_t res[4][2] = {0}; if (out_f32_bool == 0) { diff --git a/compute/tensor/src/cpu/arm/int8/v7/depthwise_pointwise_convolution_direct.cpp b/compute/tensor/src/cpu/arm/int8/v7/depthwise_pointwise_convolution_direct.cpp index 4c81efaa..499d3d2f 100644 --- a/compute/tensor/src/cpu/arm/int8/v7/depthwise_pointwise_convolution_direct.cpp +++ b/compute/tensor/src/cpu/arm/int8/v7/depthwise_pointwise_convolution_direct.cpp @@ -46,10 +46,10 @@ EE depthwise_pointwise_convolution_direct(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); U32 strideH = convParamSpec.stride_h; U32 strideW = convParamSpec.stride_w; - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; U32 dilateH = convParamSpec.dilatedRate_h; U32 dilateW = convParamSpec.dilatedRate_w; @@ -77,20 +77,20 @@ EE 
depthwise_pointwise_convolution_direct(TensorDesc inputDesc, INT8 *inArray_mov = inArray + n * ic * ihiw * 8; for (U32 c = 0; c < ic; c++) { if (paddingT > 0) { - memset(inArray_pad_mov, 0, paddingT * iw_pad * 8 * bytesOf(idt)); + UNI_MEMSET(inArray_pad_mov, 0, paddingT * iw_pad * 8 * bytesOf(idt)); inArray_pad_mov += paddingT * iw_pad * 8; } for (U32 h = paddingT; h < ih_pad - paddingB; h++) { - memset(inArray_pad_mov, 0, paddingL * 8 * bytesOf(idt)); + UNI_MEMSET(inArray_pad_mov, 0, paddingL * 8 * bytesOf(idt)); inArray_pad_mov += paddingL * 8; - memcpy(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(idt)); + UNI_MEMCPY(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(idt)); inArray_pad_mov += iw * 8; inArray_mov += iw * 8; - memset(inArray_pad_mov, 0, paddingR * 8 * bytesOf(idt)); + UNI_MEMSET(inArray_pad_mov, 0, paddingR * 8 * bytesOf(idt)); inArray_pad_mov += paddingR * 8; } if (paddingB > 0) { - memset(inArray_pad_mov, 0, paddingB * iw_pad * 8 * bytesOf(idt)); + UNI_MEMSET(inArray_pad_mov, 0, paddingB * iw_pad * 8 * bytesOf(idt)); inArray_pad_mov += paddingB * iw_pad * 8; } diff --git a/compute/tensor/src/cpu/arm/int8/v8/convolution_gemm.h b/compute/tensor/src/cpu/arm/int8/v8.2/convolution_gemm.h similarity index 100% rename from compute/tensor/src/cpu/arm/int8/v8/convolution_gemm.h rename to compute/tensor/src/cpu/arm/int8/v8.2/convolution_gemm.h diff --git a/compute/tensor/src/cpu/arm/int8/v8/convolution_gemm_A55.cpp b/compute/tensor/src/cpu/arm/int8/v8.2/convolution_gemm_A55.cpp similarity index 94% rename from compute/tensor/src/cpu/arm/int8/v8/convolution_gemm_A55.cpp rename to compute/tensor/src/cpu/arm/int8/v8.2/convolution_gemm_A55.cpp index 16d0a9be..8b622a81 100644 --- a/compute/tensor/src/cpu/arm/int8/v8/convolution_gemm_A55.cpp +++ b/compute/tensor/src/cpu/arm/int8/v8.2/convolution_gemm_A55.cpp @@ -11,8 +11,9 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-#include "cpu/arm/int8/v8/convolution_gemm.h" +#include "cpu/arm/int8/v8.2/convolution_gemm.h" #include "cpu/arm/transform_functions.h" +#include "cpu/tensor_computing_cpu.h" template EE convolution_gemm_A55(TensorDesc inputDesc, @@ -44,10 +45,10 @@ EE convolution_gemm_A55(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); U32 strideH = convParamSpec.stride_h; U32 strideW = convParamSpec.stride_w; - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; U32 dilateH = convParamSpec.dilatedRate_h; U32 dilateW = convParamSpec.dilatedRate_w; @@ -84,7 +85,7 @@ EE convolution_gemm_A55(TensorDesc inputDesc, I32 min_i32[4] = {0}; // To record min I32 values for (U32 n = 0; n < in; n++) { // for each batch - F16 scale_i = 1.0; + F32 scale_i = -1.0; // quantize input if necessary if (idt == DT_F16) { @@ -97,35 +98,9 @@ EE convolution_gemm_A55(TensorDesc inputDesc, if (*inputScale > 0) { scale_i = *inputScale; } else { - float16x8_t temp_v = vld1q_f16(in); - float16x8_t max_v = temp_v; - float16x8_t min_v = temp_v; - - for (U32 i = 8; i < numData; i += 8) { - temp_v = vld1q_f16(in + i); - max_v = vmaxq_f16(max_v, temp_v); - min_v = vminq_f16(min_v, temp_v); - } - - F16 max = vmaxvq_f16(max_v); - F16 min = vminvq_f16(min_v); - - if (max == 0 && min == 0) { - return NOT_SUPPORTED; - } - if (max > 0 && min < 0) { - F16 scale_max = 127.0 / max; - F16 scale_min = -127.0 / min; - scale_i = (scale_max < scale_min) ? scale_max : scale_min; - } else if (max < 0) { - scale_i = -127.0 / min; - } else { // min > 0 - scale_i = 127.0 / max; - } - } - for (U32 i = 0; i < numData; i++) { - F32 temp = in[i] * scale_i; - inArray[i] = round_towards_zero(temp, (*inputScale) != scale_i); + TensorDesc tmpDesc = inputDesc; + tmpDesc.dt = DT_I8; + quantize_cpu(inputDesc, in, &tmpDesc, inArray, &scale_i, ARM_A55); } *inputScale = scale_i; } else { @@ -159,7 +134,7 @@ EE convolution_gemm_A55(TensorDesc inputDesc, } } - F32 factor_s = 1.0 / ((F32)scale_i) / ((F32)(*filterScale)); + F32 factor_s = 1.0 / scale_i / ((F32)(*filterScale)); F32 factor_v[4]; for (U32 i = 0; i < 4; i++) { factor_v[i] = factor_s; @@ -767,35 +742,35 @@ EE convolution_gemm_A55(TensorDesc inputDesc, in_pack + c * fh * fw * 8 * 8 + fh_idx * fw * 8 * 4 + fw_idx * 8 * 4; INT8 *in_pack_1 = in_pack_0 + fh * fw * 8 * 4; - __asm__ __volatile__("ldr d0, [%[in_0]]\n" - "ldr x2, [%[in_2]]\n" - "ldr d1, [%[in_1]]\n" - "ldr x3, [%[in_3]]\n" - "ins v0.d[1], x2\n" - "ins v1.d[1], x3\n" - "ldr d4, [%[in_4]]\n" - "ldr x6, [%[in_6]]\n" - "trn1 v20.4s, v0.4s, v1.4s\n" - "trn2 v21.4s, v0.4s, v1.4s\n" - - "ldr d5, [%[in_5]]\n" - "ldr x7, [%[in_7]]\n" - "ins v4.d[1], x6\n" - "ins v5.d[1], x7\n" - - "str q20, [%[pack_0]]\n" - "trn1 v24.4s, v4.4s, v5.4s\n" - "trn2 v25.4s, v4.4s, v5.4s\n" - "str q21, [%[pack_1]]\n" - "str q24, [%[pack_0], #16]\n" - "str q25, [%[pack_1], #16]\n" - : - : [pack_0] "r"(in_pack_0), [pack_1] "r"(in_pack_1), - [in_0] "r"(in_0), [in_1] "r"(in_1), [in_2] "r"(in_2), - [in_3] "r"(in_3), [in_4] "r"(in_4), [in_5] "r"(in_5), - [in_6] "r"(in_6), [in_7] "r"(in_7) - : "memory", "cc", "v0", "v1", "v4", "v5", "v20", "v21", - "v24", "v25", "x2", "x3", "x6", "x7"); + __asm__ __volatile__( + "ldr d0, [%[in_0]]\n" + "ldr x2, [%[in_2]]\n" + 
"ldr d1, [%[in_1]]\n" + "ldr x3, [%[in_3]]\n" + "ins v0.d[1], x2\n" + "ins v1.d[1], x3\n" + "ldr d4, [%[in_4]]\n" + "ldr x6, [%[in_6]]\n" + "trn1 v20.4s, v0.4s, v1.4s\n" + "trn2 v21.4s, v0.4s, v1.4s\n" + + "ldr d5, [%[in_5]]\n" + "ldr x7, [%[in_7]]\n" + "ins v4.d[1], x6\n" + "ins v5.d[1], x7\n" + + "str q20, [%[pack_0]]\n" + "trn1 v24.4s, v4.4s, v5.4s\n" + "trn2 v25.4s, v4.4s, v5.4s\n" + "str q21, [%[pack_1]]\n" + "str q24, [%[pack_0], #16]\n" + "str q25, [%[pack_1], #16]\n" + : + : [pack_0] "r"(in_pack_0), [pack_1] "r"(in_pack_1), [in_0] "r"(in_0), + [in_1] "r"(in_1), [in_2] "r"(in_2), [in_3] "r"(in_3), [in_4] "r"(in_4), + [in_5] "r"(in_5), [in_6] "r"(in_6), [in_7] "r"(in_7) + : "memory", "cc", "v0", "v1", "v4", "v5", "v20", "v21", "v24", "v25", + "x2", "x3", "x6", "x7"); } } } @@ -1423,8 +1398,8 @@ EE convolution_gemm_A55(TensorDesc inputDesc, INT8 *in_pack_0 = in_pack + c * fh * fw * 8 + fh_idx * fw * 4 + fw_idx * 4; INT8 *in_pack_1 = in_pack_0 + fh * fw * 4; - memcpy(in_pack_0, in_0, 4 * bytesOf(DT_I8)); - memcpy(in_pack_1, in_0 + 4, 4 * bytesOf(DT_I8)); + UNI_MEMCPY(in_pack_0, in_0, 4 * bytesOf(DT_I8)); + UNI_MEMCPY(in_pack_1, in_0 + 4, 4 * bytesOf(DT_I8)); } } } diff --git a/compute/tensor/src/cpu/arm/int8/v8/convolution_gemm_A76.cpp b/compute/tensor/src/cpu/arm/int8/v8.2/convolution_gemm_A76.cpp similarity index 94% rename from compute/tensor/src/cpu/arm/int8/v8/convolution_gemm_A76.cpp rename to compute/tensor/src/cpu/arm/int8/v8.2/convolution_gemm_A76.cpp index 33926aef..e027ba1c 100644 --- a/compute/tensor/src/cpu/arm/int8/v8/convolution_gemm_A76.cpp +++ b/compute/tensor/src/cpu/arm/int8/v8.2/convolution_gemm_A76.cpp @@ -11,8 +11,9 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-#include "cpu/arm/int8/v8/convolution_gemm.h" +#include "cpu/arm/int8/v8.2/convolution_gemm.h" #include "cpu/arm/transform_functions.h" +#include "cpu/tensor_computing_cpu.h" template EE convolution_gemm_A76(TensorDesc inputDesc, @@ -44,10 +45,10 @@ EE convolution_gemm_A76(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); U32 strideH = convParamSpec.stride_h; U32 strideW = convParamSpec.stride_w; - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; U32 dilateH = convParamSpec.dilatedRate_h; U32 dilateW = convParamSpec.dilatedRate_w; @@ -84,7 +85,7 @@ EE convolution_gemm_A76(TensorDesc inputDesc, I32 min_i32[4] = {0}; // To record min I32 values for (U32 n = 0; n < in; n++) { // for each batch - F16 scale_i = 1.0; + F32 scale_i = -1.0; // quantize input if necessary if (idt == DT_F16) { @@ -97,35 +98,9 @@ EE convolution_gemm_A76(TensorDesc inputDesc, if (*inputScale > 0) { scale_i = *inputScale; } else { - float16x8_t temp_v = vld1q_f16(in); - float16x8_t max_v = temp_v; - float16x8_t min_v = temp_v; - - for (U32 i = 8; i < numData; i += 8) { - temp_v = vld1q_f16(in + i); - max_v = vmaxq_f16(max_v, temp_v); - min_v = vminq_f16(min_v, temp_v); - } - - F16 max = vmaxvq_f16(max_v); - F16 min = vminvq_f16(min_v); - - if (max == 0 && min == 0) { - return NOT_SUPPORTED; - } - if (max > 0 && min < 0) { - F16 scale_max = 127.0 / max; - F16 scale_min = -127.0 / min; - scale_i = (scale_max < scale_min) ? scale_max : scale_min; - } else if (max < 0) { - scale_i = -127.0 / min; - } else { // min > 0 - scale_i = 127.0 / max; - } - } - for (U32 i = 0; i < numData; i++) { - F32 temp = in[i] * scale_i; - inArray[i] = round_towards_zero(temp, (*inputScale) != scale_i); + TensorDesc tmpDesc = inputDesc; + tmpDesc.dt = DT_I8; + quantize_cpu(inputDesc, in, &tmpDesc, inArray, &scale_i, ARM_A76); } *inputScale = scale_i; } else { @@ -739,35 +714,35 @@ EE convolution_gemm_A76(TensorDesc inputDesc, in_pack + c * fh * fw * 8 * 8 + fh_idx * fw * 8 * 4 + fw_idx * 8 * 4; INT8 *in_pack_1 = in_pack_0 + fh * fw * 8 * 4; - __asm__ __volatile__("ldr d0, [%[in_0]]\n" - "ldr x2, [%[in_2]]\n" - "ldr d1, [%[in_1]]\n" - "ldr x3, [%[in_3]]\n" - "ins v0.d[1], x2\n" - "ins v1.d[1], x3\n" - "ldr d4, [%[in_4]]\n" - "ldr x6, [%[in_6]]\n" - "trn1 v20.4s, v0.4s, v1.4s\n" - "trn2 v21.4s, v0.4s, v1.4s\n" - - "ldr d5, [%[in_5]]\n" - "ldr x7, [%[in_7]]\n" - "ins v4.d[1], x6\n" - "ins v5.d[1], x7\n" - - "str q20, [%[pack_0]]\n" - "trn1 v24.4s, v4.4s, v5.4s\n" - "trn2 v25.4s, v4.4s, v5.4s\n" - "str q21, [%[pack_1]]\n" - "str q24, [%[pack_0], #16]\n" - "str q25, [%[pack_1], #16]\n" - : - : [pack_0] "r"(in_pack_0), [pack_1] "r"(in_pack_1), - [in_0] "r"(in_0), [in_1] "r"(in_1), [in_2] "r"(in_2), - [in_3] "r"(in_3), [in_4] "r"(in_4), [in_5] "r"(in_5), - [in_6] "r"(in_6), [in_7] "r"(in_7) - : "memory", "cc", "v0", "v1", "v4", "v5", "v20", "v21", - "v24", "v25", "x2", "x3", "x6", "x7"); + __asm__ __volatile__( + "ldr d0, [%[in_0]]\n" + "ldr x2, [%[in_2]]\n" + "ldr d1, [%[in_1]]\n" + "ldr x3, [%[in_3]]\n" + "ins v0.d[1], x2\n" + "ins v1.d[1], x3\n" + "ldr d4, [%[in_4]]\n" + "ldr x6, [%[in_6]]\n" + "trn1 v20.4s, v0.4s, v1.4s\n" + "trn2 v21.4s, v0.4s, v1.4s\n" + + "ldr d5, [%[in_5]]\n" + "ldr x7, [%[in_7]]\n" + "ins 
v4.d[1], x6\n" + "ins v5.d[1], x7\n" + + "str q20, [%[pack_0]]\n" + "trn1 v24.4s, v4.4s, v5.4s\n" + "trn2 v25.4s, v4.4s, v5.4s\n" + "str q21, [%[pack_1]]\n" + "str q24, [%[pack_0], #16]\n" + "str q25, [%[pack_1], #16]\n" + : + : [pack_0] "r"(in_pack_0), [pack_1] "r"(in_pack_1), [in_0] "r"(in_0), + [in_1] "r"(in_1), [in_2] "r"(in_2), [in_3] "r"(in_3), [in_4] "r"(in_4), + [in_5] "r"(in_5), [in_6] "r"(in_6), [in_7] "r"(in_7) + : "memory", "cc", "v0", "v1", "v4", "v5", "v20", "v21", "v24", "v25", + "x2", "x3", "x6", "x7"); } } } @@ -1358,8 +1333,8 @@ EE convolution_gemm_A76(TensorDesc inputDesc, INT8 *in_pack_0 = in_pack + c * fh * fw * 8 + fh_idx * fw * 4 + fw_idx * 4; INT8 *in_pack_1 = in_pack_0 + fh * fw * 4; - memcpy(in_pack_0, in_0, 4 * bytesOf(DT_I8)); - memcpy(in_pack_1, in_0 + 4, 4 * bytesOf(DT_I8)); + UNI_MEMCPY(in_pack_0, in_0, 4 * bytesOf(DT_I8)); + UNI_MEMCPY(in_pack_1, in_0 + 4, 4 * bytesOf(DT_I8)); } } } diff --git a/compute/tensor/src/cpu/arm/int8/v8/convolution_winograd.h b/compute/tensor/src/cpu/arm/int8/v8.2/convolution_winograd.h similarity index 100% rename from compute/tensor/src/cpu/arm/int8/v8/convolution_winograd.h rename to compute/tensor/src/cpu/arm/int8/v8.2/convolution_winograd.h diff --git a/compute/tensor/src/cpu/arm/int8/v8/convolution_winograd_A55.cpp b/compute/tensor/src/cpu/arm/int8/v8.2/convolution_winograd_A55.cpp similarity index 94% rename from compute/tensor/src/cpu/arm/int8/v8/convolution_winograd_A55.cpp rename to compute/tensor/src/cpu/arm/int8/v8.2/convolution_winograd_A55.cpp index 4f7ab9ba..83750188 100644 --- a/compute/tensor/src/cpu/arm/int8/v8/convolution_winograd_A55.cpp +++ b/compute/tensor/src/cpu/arm/int8/v8.2/convolution_winograd_A55.cpp @@ -11,8 +11,8 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-#include "cpu/arm/int8/v8/convolution_winograd_transform.h" -#include "cpu/arm/int8/v8/convolution_winograd.h" +#include "cpu/arm/int8/v8.2/convolution_winograd_transform.h" +#include "cpu/arm/int8/v8.2/convolution_winograd.h" template EE convolution_winograd_A55(TensorDesc inputDesc, @@ -43,10 +43,10 @@ EE convolution_winograd_A55(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; if (fdf != DF_HWNCN8C4) { return NOT_MATCH; @@ -88,6 +88,7 @@ EE convolution_winograd_A55(TensorDesc inputDesc, OT *inArray_pad = (OT *)tmp; short *itmArray = (short *)(inArray_pad + ic * ihiw * 8); // will be cast to fp16 for fp16 inputs F16 *otmArray = (F16 *)(itmArray + 6 * 6 * ic * 12 * 8); + UNI_MEMSET(otmArray, 0, 6 * 6 * 12 * 8 * sizeof(F16)); INT8 *inQ = (INT8 *)(otmArray + 6 * 6 * 12 * 8); if (DT_I8 == odt) { outArray = (F16 *)(inQ + 6 * 6 * ic * 12 * 8); // After otmArray and pack @@ -101,18 +102,18 @@ EE convolution_winograd_A55(TensorDesc inputDesc, OT *inArray_pad_mov = inArray_pad; OT *inArray_mov = inArray + n * ic * ih * iw * 8; for (U32 c = 0; c < ic; c++) { - memset(inArray_pad_mov, 0, pad_top * iw_pad * 8 * bytesOf(idt)); + UNI_MEMSET(inArray_pad_mov, 0, pad_top * iw_pad * 8 * bytesOf(idt)); inArray_pad_mov += pad_top * iw_pad * 8; for (U32 h = pad_top; h < ih_pad - pad_bottom; h++) { - memset(inArray_pad_mov, 0, pad_left * 8 * bytesOf(idt)); + UNI_MEMSET(inArray_pad_mov, 0, pad_left * 8 * bytesOf(idt)); inArray_pad_mov += pad_left * 8; - memcpy(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(idt)); + UNI_MEMCPY(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(idt)); inArray_pad_mov += iw * 8; inArray_mov += iw * 8; - memset(inArray_pad_mov, 0, pad_right * 8 * bytesOf(idt)); + UNI_MEMSET(inArray_pad_mov, 0, pad_right * 8 * bytesOf(idt)); inArray_pad_mov += pad_right * 8; } - memset(inArray_pad_mov, 0, pad_bottom * iw_pad * 8 * bytesOf(idt)); + UNI_MEMSET(inArray_pad_mov, 0, pad_bottom * iw_pad * 8 * bytesOf(idt)); inArray_pad_mov += pad_bottom * iw_pad * 8; } @@ -405,7 +406,7 @@ EE convolution_winograd_A55(TensorDesc inputDesc, INT8 *f_o0c0 = filterArray + o * 8 * 36 * ic * 8 + idx * 8 * ic * 8; F16 *out_o0hw0 = otmArray + idx * 12 * 8; if (factor_v[idx][0] == 0) { // input pixels are all 0 - memset(out_o0hw0, 0, 12 * 8 * sizeof(OT)); + UNI_MEMSET(out_o0hw0, 0, 12 * 8 * sizeof(OT)); continue; } F32 *fac = factor_v[idx]; @@ -806,35 +807,35 @@ EE convolution_winograd_A55(TensorDesc inputDesc, INT8 *in_pack_0 = in_pack + idx * 8 * ic * 8 + c * 8 * 8; INT8 *in_pack_1 = in_pack_0 + 8 * 4; - __asm__ __volatile__("ldr d0, [%[in_0]]\n" - "ldr x2, [%[in_2]]\n" - "ldr d1, [%[in_1]]\n" - "ldr x3, [%[in_3]]\n" - "ins v0.d[1], x2\n" - "ins v1.d[1], x3\n" - "ldr d4, [%[in_4]]\n" - "ldr x6, [%[in_6]]\n" - "trn1 v20.4s, v0.4s, v1.4s\n" - "trn2 v21.4s, v0.4s, v1.4s\n" - - "ldr d5, [%[in_5]]\n" - "ldr x7, [%[in_7]]\n" - "ins v4.d[1], x6\n" - "ins v5.d[1], x7\n" - - "str q20, [%[pack_0]]\n" - "trn1 v24.4s, v4.4s, v5.4s\n" - "trn2 v25.4s, v4.4s, v5.4s\n" - "str q21, [%[pack_1]]\n" - "str q24, 
[%[pack_0], #16]\n" - "str q25, [%[pack_1], #16]\n" - : - : [pack_0] "r"(in_pack_0), [pack_1] "r"(in_pack_1), - [in_0] "r"(in_0), [in_1] "r"(in_1), [in_2] "r"(in_2), - [in_3] "r"(in_3), [in_4] "r"(in_4), [in_5] "r"(in_5), - [in_6] "r"(in_6), [in_7] "r"(in_7) - : "memory", "cc", "v0", "v1", "v4", "v5", "v20", "v21", - "v24", "v25", "x2", "x3", "x6", "x7"); + __asm__ __volatile__( + "ldr d0, [%[in_0]]\n" + "ldr x2, [%[in_2]]\n" + "ldr d1, [%[in_1]]\n" + "ldr x3, [%[in_3]]\n" + "ins v0.d[1], x2\n" + "ins v1.d[1], x3\n" + "ldr d4, [%[in_4]]\n" + "ldr x6, [%[in_6]]\n" + "trn1 v20.4s, v0.4s, v1.4s\n" + "trn2 v21.4s, v0.4s, v1.4s\n" + + "ldr d5, [%[in_5]]\n" + "ldr x7, [%[in_7]]\n" + "ins v4.d[1], x6\n" + "ins v5.d[1], x7\n" + + "str q20, [%[pack_0]]\n" + "trn1 v24.4s, v4.4s, v5.4s\n" + "trn2 v25.4s, v4.4s, v5.4s\n" + "str q21, [%[pack_1]]\n" + "str q24, [%[pack_0], #16]\n" + "str q25, [%[pack_1], #16]\n" + : + : [pack_0] "r"(in_pack_0), [pack_1] "r"(in_pack_1), [in_0] "r"(in_0), + [in_1] "r"(in_1), [in_2] "r"(in_2), [in_3] "r"(in_3), [in_4] "r"(in_4), + [in_5] "r"(in_5), [in_6] "r"(in_6), [in_7] "r"(in_7) + : "memory", "cc", "v0", "v1", "v4", "v5", "v20", "v21", "v24", "v25", "x2", + "x3", "x6", "x7"); } } @@ -847,7 +848,7 @@ EE convolution_winograd_A55(TensorDesc inputDesc, INT8 *f_o0c0 = filterArray + o * 8 * 36 * ic * 8 + idx * 8 * ic * 8; F16 *out_o0hw0 = otmArray + idx * 8 * 8; if (factor_v[idx][0] == 0) { // input pixels are all 0 - memset(out_o0hw0, 0, 8 * 8 * sizeof(OT)); + UNI_MEMSET(out_o0hw0, 0, 8 * 8 * sizeof(OT)); continue; } F32 *fac = factor_v[idx]; @@ -1133,21 +1134,21 @@ EE convolution_winograd_A55(TensorDesc inputDesc, INT8 *in_pack_0 = in_pack + idx * 4 * ic * 8 + c * 4 * 8; INT8 *in_pack_1 = in_pack_0 + 4 * 4; - __asm__ __volatile__("ldr d0, [%[in_0]]\n" - "ldr x2, [%[in_2]]\n" - "ldr d1, [%[in_1]]\n" - "ldr x3, [%[in_3]]\n" - "ins v0.d[1], x2\n" - "ins v1.d[1], x3\n" - "trn1 v20.4s, v0.4s, v1.4s\n" - "trn2 v21.4s, v0.4s, v1.4s\n" - "str q20, [%[pack_0]]\n" - "str q21, [%[pack_1]]\n" - : - : [pack_0] "r"(in_pack_0), [pack_1] "r"(in_pack_1), - [in_0] "r"(in_0), [in_1] "r"(in_1), [in_2] "r"(in_2), - [in_3] "r"(in_3) - : "memory", "cc", "v0", "v1", "v20", "v21", "x2", "x3"); + __asm__ __volatile__( + "ldr d0, [%[in_0]]\n" + "ldr x2, [%[in_2]]\n" + "ldr d1, [%[in_1]]\n" + "ldr x3, [%[in_3]]\n" + "ins v0.d[1], x2\n" + "ins v1.d[1], x3\n" + "trn1 v20.4s, v0.4s, v1.4s\n" + "trn2 v21.4s, v0.4s, v1.4s\n" + "str q20, [%[pack_0]]\n" + "str q21, [%[pack_1]]\n" + : + : [pack_0] "r"(in_pack_0), [pack_1] "r"(in_pack_1), [in_0] "r"(in_0), + [in_1] "r"(in_1), [in_2] "r"(in_2), [in_3] "r"(in_3) + : "memory", "cc", "v0", "v1", "v20", "v21", "x2", "x3"); } } @@ -1160,7 +1161,7 @@ EE convolution_winograd_A55(TensorDesc inputDesc, INT8 *f_o0c0 = filterArray + o * 8 * 36 * ic * 8 + idx * 8 * ic * 8; F16 *out_o0hw0 = otmArray + idx * 4 * 8; if (factor_v[idx][0] == 0) { - memset(out_o0hw0, 0, 4 * 8 * sizeof(OT)); + UNI_MEMSET(out_o0hw0, 0, 4 * 8 * sizeof(OT)); continue; } F32 *fac = factor_v[idx]; @@ -1349,8 +1350,8 @@ EE convolution_winograd_A55(TensorDesc inputDesc, INT8 *in_pack_0 = in_pack + idx * ic * 8 + c * 8; INT8 *in_pack_1 = in_pack_0 + 4; - memcpy(in_pack_0, in_0, 4 * bytesOf(DT_I8)); - memcpy(in_pack_1, in_0 + 4, 4 * bytesOf(DT_I8)); + UNI_MEMCPY(in_pack_0, in_0, 4 * bytesOf(DT_I8)); + UNI_MEMCPY(in_pack_1, in_0 + 4, 4 * bytesOf(DT_I8)); } } @@ -1363,7 +1364,7 @@ EE convolution_winograd_A55(TensorDesc inputDesc, INT8 *f_o = filterArray + o * 8 * 36 * ic * 8 + idx * 8 * ic * 8; F16 
*out_o0hw0 = otmArray + idx * 8; if (factor_v[idx][0] == 0) { - memset(out_o0hw0, 0, 8 * sizeof(OT)); + UNI_MEMSET(out_o0hw0, 0, 8 * sizeof(OT)); continue; } int32x4_t res[2] = {0}; diff --git a/compute/tensor/src/cpu/arm/int8/v8/convolution_winograd_A76.cpp b/compute/tensor/src/cpu/arm/int8/v8.2/convolution_winograd_A76.cpp similarity index 94% rename from compute/tensor/src/cpu/arm/int8/v8/convolution_winograd_A76.cpp rename to compute/tensor/src/cpu/arm/int8/v8.2/convolution_winograd_A76.cpp index 168a4ab6..ca160982 100644 --- a/compute/tensor/src/cpu/arm/int8/v8/convolution_winograd_A76.cpp +++ b/compute/tensor/src/cpu/arm/int8/v8.2/convolution_winograd_A76.cpp @@ -11,8 +11,8 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -#include "cpu/arm/int8/v8/convolution_winograd_transform.h" -#include "cpu/arm/int8/v8/convolution_winograd.h" +#include "cpu/arm/int8/v8.2/convolution_winograd_transform.h" +#include "cpu/arm/int8/v8.2/convolution_winograd.h" template EE convolution_winograd_A76(TensorDesc inputDesc, @@ -43,10 +43,10 @@ EE convolution_winograd_A76(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; if (fdf != DF_HWNCN8C4) { return NOT_MATCH; @@ -88,6 +88,7 @@ EE convolution_winograd_A76(TensorDesc inputDesc, OT *inArray_pad = (OT *)tmp; short *itmArray = (short *)(inArray_pad + ic * ihiw * 8); // will be cast to fp16 for fp16 inputs F16 *otmArray = (F16 *)(itmArray + 6 * 6 * ic * 12 * 8); + UNI_MEMSET(otmArray, 0, 6 * 6 * 12 * 8 * sizeof(F16)); INT8 *inQ = (INT8 *)(otmArray + 6 * 6 * 12 * 8); if (DT_I8 == odt) { outArray = (F16 *)(inQ + 6 * 6 * ic * 12 * 8); // After otmArray and pack @@ -101,18 +102,18 @@ EE convolution_winograd_A76(TensorDesc inputDesc, OT *inArray_pad_mov = inArray_pad; OT *inArray_mov = inArray + n * ic * ih * iw * 8; for (U32 c = 0; c < ic; c++) { - memset(inArray_pad_mov, 0, pad_top * iw_pad * 8 * bytesOf(idt)); + UNI_MEMSET(inArray_pad_mov, 0, pad_top * iw_pad * 8 * bytesOf(idt)); inArray_pad_mov += pad_top * iw_pad * 8; for (U32 h = pad_top; h < ih_pad - pad_bottom; h++) { - memset(inArray_pad_mov, 0, pad_left * 8 * bytesOf(idt)); + UNI_MEMSET(inArray_pad_mov, 0, pad_left * 8 * bytesOf(idt)); inArray_pad_mov += pad_left * 8; - memcpy(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(idt)); + UNI_MEMCPY(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(idt)); inArray_pad_mov += iw * 8; inArray_mov += iw * 8; - memset(inArray_pad_mov, 0, pad_right * 8 * bytesOf(idt)); + UNI_MEMSET(inArray_pad_mov, 0, pad_right * 8 * bytesOf(idt)); inArray_pad_mov += pad_right * 8; } - memset(inArray_pad_mov, 0, pad_bottom * iw_pad * 8 * bytesOf(idt)); + UNI_MEMSET(inArray_pad_mov, 0, pad_bottom * iw_pad * 8 * bytesOf(idt)); inArray_pad_mov += pad_bottom * iw_pad * 8; } @@ -405,7 +406,7 @@ EE convolution_winograd_A76(TensorDesc inputDesc, INT8 *f_o0c0 = filterArray + o * 8 * 36 * 
ic * 8 + idx * 8 * ic * 8; F16 *out_o0hw0 = otmArray + idx * 12 * 8; if (factor_v[idx][0] == 0) { // input pixels are all 0 - memset(out_o0hw0, 0, 12 * 8 * sizeof(OT)); + UNI_MEMSET(out_o0hw0, 0, 12 * 8 * sizeof(OT)); continue; } F32 *fac = factor_v[idx]; @@ -787,35 +788,35 @@ EE convolution_winograd_A76(TensorDesc inputDesc, INT8 *in_pack_0 = in_pack + idx * 8 * ic * 8 + c * 8 * 8; INT8 *in_pack_1 = in_pack_0 + 8 * 4; - __asm__ __volatile__("ldr d0, [%[in_0]]\n" - "ldr x2, [%[in_2]]\n" - "ldr d1, [%[in_1]]\n" - "ldr x3, [%[in_3]]\n" - "ins v0.d[1], x2\n" - "ins v1.d[1], x3\n" - "ldr d4, [%[in_4]]\n" - "ldr x6, [%[in_6]]\n" - "trn1 v20.4s, v0.4s, v1.4s\n" - "trn2 v21.4s, v0.4s, v1.4s\n" - - "ldr d5, [%[in_5]]\n" - "ldr x7, [%[in_7]]\n" - "ins v4.d[1], x6\n" - "ins v5.d[1], x7\n" - - "str q20, [%[pack_0]]\n" - "trn1 v24.4s, v4.4s, v5.4s\n" - "trn2 v25.4s, v4.4s, v5.4s\n" - "str q21, [%[pack_1]]\n" - "str q24, [%[pack_0], #16]\n" - "str q25, [%[pack_1], #16]\n" - : - : [pack_0] "r"(in_pack_0), [pack_1] "r"(in_pack_1), - [in_0] "r"(in_0), [in_1] "r"(in_1), [in_2] "r"(in_2), - [in_3] "r"(in_3), [in_4] "r"(in_4), [in_5] "r"(in_5), - [in_6] "r"(in_6), [in_7] "r"(in_7) - : "memory", "cc", "v0", "v1", "v4", "v5", "v20", "v21", - "v24", "v25", "x2", "x3", "x6", "x7"); + __asm__ __volatile__( + "ldr d0, [%[in_0]]\n" + "ldr x2, [%[in_2]]\n" + "ldr d1, [%[in_1]]\n" + "ldr x3, [%[in_3]]\n" + "ins v0.d[1], x2\n" + "ins v1.d[1], x3\n" + "ldr d4, [%[in_4]]\n" + "ldr x6, [%[in_6]]\n" + "trn1 v20.4s, v0.4s, v1.4s\n" + "trn2 v21.4s, v0.4s, v1.4s\n" + + "ldr d5, [%[in_5]]\n" + "ldr x7, [%[in_7]]\n" + "ins v4.d[1], x6\n" + "ins v5.d[1], x7\n" + + "str q20, [%[pack_0]]\n" + "trn1 v24.4s, v4.4s, v5.4s\n" + "trn2 v25.4s, v4.4s, v5.4s\n" + "str q21, [%[pack_1]]\n" + "str q24, [%[pack_0], #16]\n" + "str q25, [%[pack_1], #16]\n" + : + : [pack_0] "r"(in_pack_0), [pack_1] "r"(in_pack_1), [in_0] "r"(in_0), + [in_1] "r"(in_1), [in_2] "r"(in_2), [in_3] "r"(in_3), [in_4] "r"(in_4), + [in_5] "r"(in_5), [in_6] "r"(in_6), [in_7] "r"(in_7) + : "memory", "cc", "v0", "v1", "v4", "v5", "v20", "v21", "v24", "v25", "x2", + "x3", "x6", "x7"); } } @@ -828,7 +829,7 @@ EE convolution_winograd_A76(TensorDesc inputDesc, INT8 *f_o0c0 = filterArray + o * 8 * 36 * ic * 8 + idx * 8 * ic * 8; F16 *out_o0hw0 = otmArray + idx * 8 * 8; if (factor_v[idx][0] == 0) { // input pixels are all 0 - memset(out_o0hw0, 0, 8 * 8 * sizeof(OT)); + UNI_MEMSET(out_o0hw0, 0, 8 * 8 * sizeof(OT)); continue; } F32 *fac = factor_v[idx]; @@ -1099,21 +1100,21 @@ EE convolution_winograd_A76(TensorDesc inputDesc, INT8 *in_pack_0 = in_pack + idx * 4 * ic * 8 + c * 4 * 8; INT8 *in_pack_1 = in_pack_0 + 4 * 4; - __asm__ __volatile__("ldr d0, [%[in_0]]\n" - "ldr x2, [%[in_2]]\n" - "ldr d1, [%[in_1]]\n" - "ldr x3, [%[in_3]]\n" - "ins v0.d[1], x2\n" - "ins v1.d[1], x3\n" - "trn1 v20.4s, v0.4s, v1.4s\n" - "trn2 v21.4s, v0.4s, v1.4s\n" - "str q20, [%[pack_0]]\n" - "str q21, [%[pack_1]]\n" - : - : [pack_0] "r"(in_pack_0), [pack_1] "r"(in_pack_1), - [in_0] "r"(in_0), [in_1] "r"(in_1), [in_2] "r"(in_2), - [in_3] "r"(in_3) - : "memory", "cc", "v0", "v1", "v20", "v21", "x2", "x3"); + __asm__ __volatile__( + "ldr d0, [%[in_0]]\n" + "ldr x2, [%[in_2]]\n" + "ldr d1, [%[in_1]]\n" + "ldr x3, [%[in_3]]\n" + "ins v0.d[1], x2\n" + "ins v1.d[1], x3\n" + "trn1 v20.4s, v0.4s, v1.4s\n" + "trn2 v21.4s, v0.4s, v1.4s\n" + "str q20, [%[pack_0]]\n" + "str q21, [%[pack_1]]\n" + : + : [pack_0] "r"(in_pack_0), [pack_1] "r"(in_pack_1), [in_0] "r"(in_0), + [in_1] "r"(in_1), [in_2] "r"(in_2), [in_3] 
"r"(in_3) + : "memory", "cc", "v0", "v1", "v20", "v21", "x2", "x3"); } } @@ -1126,7 +1127,7 @@ EE convolution_winograd_A76(TensorDesc inputDesc, INT8 *f_o0c0 = filterArray + o * 8 * 36 * ic * 8 + idx * 8 * ic * 8; F16 *out_o0hw0 = otmArray + idx * 4 * 8; if (factor_v[idx][0] == 0) { - memset(out_o0hw0, 0, 4 * 8 * sizeof(OT)); + UNI_MEMSET(out_o0hw0, 0, 4 * 8 * sizeof(OT)); continue; } F32 *fac = factor_v[idx]; @@ -1302,8 +1303,8 @@ EE convolution_winograd_A76(TensorDesc inputDesc, INT8 *in_pack_0 = in_pack + idx * ic * 8 + c * 8; INT8 *in_pack_1 = in_pack_0 + 4; - memcpy(in_pack_0, in_0, 4 * bytesOf(DT_I8)); - memcpy(in_pack_1, in_0 + 4, 4 * bytesOf(DT_I8)); + UNI_MEMCPY(in_pack_0, in_0, 4 * bytesOf(DT_I8)); + UNI_MEMCPY(in_pack_1, in_0 + 4, 4 * bytesOf(DT_I8)); } } @@ -1316,7 +1317,7 @@ EE convolution_winograd_A76(TensorDesc inputDesc, INT8 *f_o = filterArray + o * 8 * 36 * ic * 8 + idx * 8 * ic * 8; F16 *out_o0hw0 = otmArray + idx * 8; if (factor_v[idx][0] == 0) { - memset(out_o0hw0, 0, 8 * sizeof(OT)); + UNI_MEMSET(out_o0hw0, 0, 8 * sizeof(OT)); continue; } int32x4_t res[2] = {0}; diff --git a/compute/tensor/src/cpu/arm/int8/v8/convolution_winograd_transform.h b/compute/tensor/src/cpu/arm/int8/v8.2/convolution_winograd_transform.h similarity index 100% rename from compute/tensor/src/cpu/arm/int8/v8/convolution_winograd_transform.h rename to compute/tensor/src/cpu/arm/int8/v8.2/convolution_winograd_transform.h diff --git a/compute/tensor/src/cpu/arm/int8/v8/depthwise_pointwise_convolution_direct.cpp b/compute/tensor/src/cpu/arm/int8/v8.2/depthwise_pointwise_convolution_direct.cpp similarity index 99% rename from compute/tensor/src/cpu/arm/int8/v8/depthwise_pointwise_convolution_direct.cpp rename to compute/tensor/src/cpu/arm/int8/v8.2/depthwise_pointwise_convolution_direct.cpp index 77ec8489..a96d8cf3 100644 --- a/compute/tensor/src/cpu/arm/int8/v8/depthwise_pointwise_convolution_direct.cpp +++ b/compute/tensor/src/cpu/arm/int8/v8.2/depthwise_pointwise_convolution_direct.cpp @@ -45,10 +45,10 @@ EE depthwise_pointwise_convolution_direct(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); U32 strideH = convParamSpec.stride_h; U32 strideW = convParamSpec.stride_w; - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; U32 dilateH = convParamSpec.dilatedRate_h; U32 dilateW = convParamSpec.dilatedRate_w; @@ -76,20 +76,20 @@ EE depthwise_pointwise_convolution_direct(TensorDesc inputDesc, INT8 *inArray_mov = inArray + n * ic * ihiw * 8; for (U32 c = 0; c < ic; c++) { if (paddingT > 0) { - memset(inArray_pad_mov, 0, paddingT * iw_pad * 8 * bytesOf(idt)); + UNI_MEMSET(inArray_pad_mov, 0, paddingT * iw_pad * 8 * bytesOf(idt)); inArray_pad_mov += paddingT * iw_pad * 8; } for (U32 h = paddingT; h < ih_pad - paddingB; h++) { - memset(inArray_pad_mov, 0, paddingL * 8 * bytesOf(idt)); + UNI_MEMSET(inArray_pad_mov, 0, paddingL * 8 * bytesOf(idt)); inArray_pad_mov += paddingL * 8; - memcpy(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(idt)); + UNI_MEMCPY(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(idt)); inArray_pad_mov += iw * 8; inArray_mov += iw * 8; - memset(inArray_pad_mov, 0, paddingR * 8 * bytesOf(idt)); + UNI_MEMSET(inArray_pad_mov, 0, paddingR * 8 * bytesOf(idt)); 
inArray_pad_mov += paddingR * 8; } if (paddingB > 0) { - memset(inArray_pad_mov, 0, paddingB * iw_pad * 8 * bytesOf(idt)); + UNI_MEMSET(inArray_pad_mov, 0, paddingB * iw_pad * 8 * bytesOf(idt)); inArray_pad_mov += paddingB * iw_pad * 8; } diff --git a/compute/tensor/src/cpu/arm/normalization.cpp b/compute/tensor/src/cpu/arm/normalization.cpp index a26d8bc3..33c01e5f 100644 --- a/compute/tensor/src/cpu/arm/normalization.cpp +++ b/compute/tensor/src/cpu/arm/normalization.cpp @@ -19,28 +19,31 @@ #include "cpu/arm/fp16/tensor_computing_fp16.h" #endif -EE layer_normalization_arm( - TensorDesc inputDesc, void *input, void *alpha, void *beta, TensorDesc outputDesc, void *output) +EE layer_normalization_arm(TensorDesc inputDesc, + void *input, + LayerNormParamSpec p, + void *alpha, + void *beta, + TensorDesc outputDesc, + void *output) { - DataType idt = inputDesc.dt; - EE ret = SUCCESS; - switch (idt) { + EE ret = NOT_SUPPORTED; + switch (inputDesc.dt) { #ifdef _USE_FP32 case DT_F32: { ret = layer_normalization_fp32( - inputDesc, (F32 *)input, (F32 *)alpha, (F32 *)beta, outputDesc, (F32 *)output); + inputDesc, (F32 *)input, p, (F32 *)alpha, (F32 *)beta, outputDesc, (F32 *)output); break; } #endif #ifdef _USE_FP16 case DT_F16: { ret = layer_normalization_fp16( - inputDesc, (F16 *)input, (F16 *)alpha, (F16 *)beta, outputDesc, (F16 *)output); + inputDesc, (F16 *)input, p, (F16 *)alpha, (F16 *)beta, outputDesc, (F16 *)output); break; } #endif default: - ret = NOT_SUPPORTED; break; } return ret; diff --git a/compute/tensor/src/cpu/arm/padding.cpp b/compute/tensor/src/cpu/arm/padding.cpp index da462d8e..091a6355 100644 --- a/compute/tensor/src/cpu/arm/padding.cpp +++ b/compute/tensor/src/cpu/arm/padding.cpp @@ -12,7 +12,6 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
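These hunks, like the ones before them, replace raw memcpy/memset calls with the UNI_MEMCPY/UNI_MEMSET wrappers. Their definitions are not part of this diff; a plausible shape, stated only as an assumption, is a pair of macros that dispatch to the bounds-checked secure-C functions when those are compiled in and fall back to the plain libc calls otherwise:

#include <string.h>

// Hypothetical definitions for illustration; the real macros live in the uni
// headers and may differ in guard name and error handling.
#ifdef _USE_SECURE_C
#define UNI_MEMCPY(dst, src, n) memcpy_s((dst), (n), (src), (n))
#define UNI_MEMSET(dst, c, n) memset_s((dst), (n), (c), (n))
#else
#define UNI_MEMCPY(dst, src, n) memcpy((dst), (src), (n))
#define UNI_MEMSET(dst, c, n) memset((dst), (c), (n))
#endif

Routing every copy and fill through one wrapper keeps the secure/portable choice in a single place instead of scattering #ifdefs across the kernels.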
#include "cpu/arm/tensor_computing_arm.h" -#include EE padding_arm(TensorDesc inputDesc, const void *input, @@ -40,33 +39,33 @@ EE padding_arm(TensorDesc inputDesc, (const U8 *)input + (((n * ic + c) * ih + h) * iw) * alignSize * bytesOf(idt); U8 *outPtr = (U8 *)output + (((n * oc + c) * oh + (padParamSpec.top + h)) * ow) * alignSize * bytesOf(odt); - if (padParamSpec.pad_mode == Pad_Constant) { - memset(outPtr, 0, padParamSpec.left * alignSize * bytesOf(odt)); + if (padParamSpec.pad_mode == PAD_CONSTANT) { + UNI_MEMSET(outPtr, 0, padParamSpec.left * alignSize * bytesOf(odt)); outPtr += padParamSpec.left * alignSize * bytesOf(odt); - memcpy(outPtr, inPtr, iw * alignSize * bytesOf(idt)); + UNI_MEMCPY(outPtr, inPtr, iw * alignSize * bytesOf(idt)); outPtr += iw * alignSize * bytesOf(odt); - memset(outPtr, 0, padParamSpec.right * alignSize * bytesOf(odt)); + UNI_MEMSET(outPtr, 0, padParamSpec.right * alignSize * bytesOf(odt)); } else { for (U32 w = 0; w < padParamSpec.left; w++) { U32 index = 0; - if (padParamSpec.pad_mode == Pad_Reflect) { + if (padParamSpec.pad_mode == PAD_REFLECT) { index = (padParamSpec.left - w) * alignSize * bytesOf(idt); - } else if (padParamSpec.pad_mode == Pad_Symmetric) { + } else if (padParamSpec.pad_mode == PAD_SYMMETRIC) { index = (padParamSpec.left - w - 1) * alignSize * bytesOf(idt); } - memcpy(outPtr, inPtr + index, alignSize * bytesOf(idt)); + UNI_MEMCPY(outPtr, inPtr + index, alignSize * bytesOf(idt)); outPtr += alignSize * bytesOf(idt); } - memcpy(outPtr, inPtr, iw * alignSize * bytesOf(idt)); + UNI_MEMCPY(outPtr, inPtr, iw * alignSize * bytesOf(idt)); outPtr += iw * alignSize * bytesOf(odt); for (U32 w = 0; w < padParamSpec.right; w++) { U32 index = (iw - 1) * alignSize * bytesOf(idt); - if (padParamSpec.pad_mode == Pad_Reflect) { + if (padParamSpec.pad_mode == PAD_REFLECT) { index = (iw - w - 2) * alignSize * bytesOf(idt); - } else if (padParamSpec.pad_mode == Pad_Symmetric) { + } else if (padParamSpec.pad_mode == PAD_SYMMETRIC) { index = (iw - w - 1) * alignSize * bytesOf(idt); } - memcpy(outPtr, inPtr + index, alignSize * bytesOf(idt)); + UNI_MEMCPY(outPtr, inPtr + index, alignSize * bytesOf(idt)); outPtr += alignSize * bytesOf(idt); } } @@ -74,20 +73,20 @@ EE padding_arm(TensorDesc inputDesc, U8 *outPtr = (U8 *)output + (((n * oc + c) * oh) * ow) * alignSize * bytesOf(odt); for (U32 h = 0; h < padParamSpec.top; h++) { U32 index = h * ow * alignSize * bytesOf(odt); - if (padParamSpec.pad_mode == Pad_Constant) { - memset(outPtr + index, 0, ow * alignSize * bytesOf(odt)); - } else if (padParamSpec.pad_mode == Pad_Edge) { - memcpy(outPtr + index, + if (padParamSpec.pad_mode == PAD_CONSTANT) { + UNI_MEMSET(outPtr + index, 0, ow * alignSize * bytesOf(odt)); + } else if (padParamSpec.pad_mode == PAD_EDGE) { + UNI_MEMCPY(outPtr + index, outPtr + (padParamSpec.top * ow * alignSize * bytesOf(odt)), ow * alignSize * bytesOf(odt)); - } else if (padParamSpec.pad_mode == Pad_Reflect) { - memcpy(outPtr + index, + } else if (padParamSpec.pad_mode == PAD_REFLECT) { + UNI_MEMCPY(outPtr + index, outPtr + ((padParamSpec.top + padParamSpec.top - h) * ow * alignSize * bytesOf(odt)), ow * alignSize * bytesOf(odt)); - } else if (padParamSpec.pad_mode == Pad_Symmetric) { - memcpy(outPtr + index, + } else if (padParamSpec.pad_mode == PAD_SYMMETRIC) { + UNI_MEMCPY(outPtr + index, outPtr + ((padParamSpec.top + padParamSpec.top - h - 1) * ow * alignSize * bytesOf(odt)), @@ -98,21 +97,21 @@ EE padding_arm(TensorDesc inputDesc, } for (U32 h = 0; h < padParamSpec.bottom; h++) { U32 
index = (padParamSpec.top + ih + h) * ow * alignSize * bytesOf(odt); - if (padParamSpec.pad_mode == Pad_Constant) { - memset(outPtr + index, 0, ow * alignSize * bytesOf(odt)); - } else if (padParamSpec.pad_mode == Pad_Edge) { - memcpy(outPtr + index, + if (padParamSpec.pad_mode == PAD_CONSTANT) { + UNI_MEMSET(outPtr + index, 0, ow * alignSize * bytesOf(odt)); + } else if (padParamSpec.pad_mode == PAD_EDGE) { + UNI_MEMCPY(outPtr + index, outPtr + ((padParamSpec.top + ih - 1) * ow * alignSize * bytesOf(odt)), ow * alignSize * bytesOf(odt)); - } else if (padParamSpec.pad_mode == Pad_Reflect) { - // memcpy(outPtr+index, outPtr+((padParamSpec.top+ih-2-h)*ow*alignSize*bytesOf(odt)), ow*alignSize*bytesOf(odt)); - memcpy(outPtr + index, + } else if (padParamSpec.pad_mode == PAD_REFLECT) { + // UNI_MEMCPY(outPtr+index, outPtr+((padParamSpec.top+ih-2-h)*ow*alignSize*bytesOf(odt)), ow*alignSize*bytesOf(odt)); + UNI_MEMCPY(outPtr + index, outPtr + ((padParamSpec.top + ih - 1 - padParamSpec.bottom + h) * ow * alignSize * bytesOf(odt)), ow * alignSize * bytesOf(odt)); - } else if (padParamSpec.pad_mode == Pad_Symmetric) { - memcpy(outPtr + index, + } else if (padParamSpec.pad_mode == PAD_SYMMETRIC) { + UNI_MEMCPY(outPtr + index, outPtr + ((padParamSpec.top + ih - 1 - h) * ow * alignSize * bytesOf(odt)), ow * alignSize * bytesOf(odt)); } else { diff --git a/compute/tensor/src/cpu/arm/pooling.cpp b/compute/tensor/src/cpu/arm/pooling.cpp index d0f0586f..39c1d25f 100644 --- a/compute/tensor/src/cpu/arm/pooling.cpp +++ b/compute/tensor/src/cpu/arm/pooling.cpp @@ -54,7 +54,7 @@ EE pooling_arm(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); it = ot = 1; - p.padding_before = p.padding_after = 0; + p.pad_before = p.pad_after = 0; p.kernel_t = p.stride_t = 1; } else if (tensorIs5d(inputDesc)) { CHECK_STATUS(tensor5dGet(inputDesc, &idt, &idf, &in, &ic, &it, &ih, &iw)); @@ -72,13 +72,11 @@ EE pooling_arm(TensorDesc inputDesc, if (idf != DF_NCHWC8 || odf != idf) { ret = NOT_MATCH; } - if (p.padding_before >= p.kernel_t || p.padding_top >= p.kernel_h || - p.padding_left >= p.kernel_w) { + if (p.pad_before >= p.kernel_t || p.pad_top >= p.kernel_h || p.pad_left >= p.kernel_w) { return NOT_SUPPORTED; } ic /= 8; - int kernelSize = p.kernel_t * p.kernel_h * p.kernel_w; ArmPoolingFunction func = nullptr; if (p.mode == POOLING_MAX) { switch (idt) { @@ -124,29 +122,40 @@ EE pooling_arm(TensorDesc inputDesc, return NOT_SUPPORTED; } - const U8 *inputPtr = (const U8 *)input; - U8 *outputPtr = (U8 *)output; - for (U32 n = 0; n < in; n++) { - for (U32 c = 0; c < ic; c++) { +#ifdef _USE_OPENMP +#pragma omp parallel num_threads(OMP_NUM_THREADS) +#endif + { + int kernelSize = p.kernel_t * p.kernel_h * p.kernel_w; +#ifdef _USE_OPENMP +#pragma omp for +#endif + for (U32 o = 0; o < in * ic; o++) { + U32 n = o / ic; + U32 c = o % ic; + const U8 *src = (const U8 *)input + o * it * ih * iw * 8 * bytesOf(idt); + U8 *dst = (U8 *)output + o * ot * oh * ow * 8 * bytesOf(idt); for (U32 t = 0; t < ot; t++) { - int tstart = t * (int)p.stride_t - (int)p.padding_before; + int tstart = t * (int)p.stride_t - (int)p.pad_before; int tend = UNI_MIN(tstart + p.kernel_t, it); tstart = UNI_MAX(tstart, 0); for (U32 h = 0; h < oh; h++) { - int hstart = h * (int)p.stride_h - (int)p.padding_top; + int hstart = h * (int)p.stride_h - (int)p.pad_top; int hend = UNI_MIN(hstart + p.kernel_h, ih); hstart = UNI_MAX(hstart, 0); - for (U32 w = 0; w < ow; 
w++, outputPtr += 8 * bytesOf(odt)) { - int wstart = w * (int)p.stride_w - (int)p.padding_left; + for (U32 w = 0; w < ow; w++, dst += 8 * bytesOf(idt)) { + int wstart = w * (int)p.stride_w - (int)p.pad_left; int wend = UNI_MIN(wstart + p.kernel_w, iw); wstart = UNI_MAX(wstart, 0); - int poolSize = (tend - tstart) * (hend - hstart) * (wend - wstart); + int poolSize = kernelSize; + if (!p.count_include_pad) { + poolSize = (tend - tstart) * (hend - hstart) * (wend - wstart); + } ret = func(tstart, tend, hstart, hend, wstart, wend, kernelSize, poolSize, - inputPtr, it, ih, iw, outputPtr, scale); + src, it, ih, iw, dst, scale); } } } - inputPtr += it * ih * iw * 8 * bytesOf(idt); } } return ret; @@ -174,37 +183,47 @@ EE pooling_bp_arm( if (idf != DF_NCHWC8 || odf != idf) { ret = NOT_MATCH; } - if (p.padding_top >= p.kernel_h || p.padding_left >= p.kernel_w) { + if (p.pad_top >= p.kernel_h || p.pad_left >= p.kernel_w) { ret = NOT_SUPPORTED; } ic /= 8; - const U8 *inputPtr = (const U8 *)input; - U8 *outputPtr = (U8 *)output; - for (U32 n = 0; n < in; n++) { - for (U32 c = 0; c < ic; c++) { - for (U32 h = 0; h < ih; h++) { - for (U32 w = 0; w < iw; w++, inputPtr += 8 * bytesOf(idt)) { - int hstart = (int)h * (int)p.stride_h - (int)p.padding_top; - int wstart = (int)w * (int)p.stride_w - (int)p.padding_left; - int hend = UNI_MIN(hstart + p.kernel_h, oh); - int wend = UNI_MIN(wstart + p.kernel_w, ow); - hstart = UNI_MAX(hstart, 0); - wstart = UNI_MAX(wstart, 0); - switch (idt) { + const U8 *src = (const U8 *)input; + U8 *dst = (U8 *)output; + int poolSize = p.kernel_t * p.kernel_h * p.kernel_w; +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 o = 0; o < in * ic; o++) { + U32 n = o / ic; + U32 c = o % ic; + //for (U32 n = 0; n < in; n++) { + // for (U32 c = 0; c < ic; c++) { + const U8 *src = (const U8 *)input + o * ih * iw * 8 * bytesOf(idt); + U8 *dst = (U8 *)output + o * oh * ow * 8 * bytesOf(idt); + for (U32 h = 0; h < ih; h++) { + for (U32 w = 0; w < iw; w++, src += 8 * bytesOf(idt)) { + int hstart = (int)h * (int)p.stride_h - (int)p.pad_top; + int wstart = (int)w * (int)p.stride_w - (int)p.pad_left; + int hend = UNI_MIN(hstart + p.kernel_h, oh); + int wend = UNI_MIN(wstart + p.kernel_w, ow); + hstart = UNI_MAX(hstart, 0); + wstart = UNI_MAX(wstart, 0); + if (!p.count_include_pad) { + poolSize = (hend - hstart) * (wend - wstart); + } + switch (idt) { #ifdef _USE_FP32 - case DT_F32: - ret = pooling_bp_c8_fp32((const F32 *)inputPtr, hstart, hend, wstart, - wend, (F32 *)outputPtr, ow, p); - break; + case DT_F32: + ret = pooling_bp_c8_fp32((const F32 *)src, hstart, hend, wstart, wend, + poolSize, (F32 *)dst, ow, p); + break; #endif - default: - ret = NOT_SUPPORTED; - break; - } + default: + ret = NOT_SUPPORTED; + break; } } - outputPtr += oh * ow * 8 * bytesOf(odt); } } return ret; diff --git a/compute/tensor/src/cpu/arm/scale.cpp b/compute/tensor/src/cpu/arm/scale.cpp index 405ae42c..4e44f033 100644 --- a/compute/tensor/src/cpu/arm/scale.cpp +++ b/compute/tensor/src/cpu/arm/scale.cpp @@ -12,6 +12,7 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
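In the pooling hunks above, the forward and backward loops are flattened over o = n * ic + c so that a single #pragma omp parallel for covers the whole batch-times-channel-block space, with each thread deriving its own src/dst pointers from o (n = o / ic, c = o % ic) instead of advancing shared running pointers. The other behavioural change is the average-pool divisor, which now honours count_include_pad. A minimal sketch of that divisor rule, reusing the names from the diff but not taken from the library, is:

// Divisor for average pooling: either the full kernel volume (padded positions
// count toward the average) or only the in-bounds part of the window after
// clamping by tstart/tend, hstart/hend, wstart/wend.
static inline int pool_divisor(bool count_include_pad, int kernel_t, int kernel_h,
    int kernel_w, int tstart, int tend, int hstart, int hend, int wstart, int wend)
{
    if (count_include_pad) {
        return kernel_t * kernel_h * kernel_w;
    }
    return (tend - tstart) * (hend - hstart) * (wend - wstart);
}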
#include "cpu/arm/tensor_computing_arm.h" +#include "cpu/arm/int32/tensor_computing_int32.h" #ifdef _USE_FP32 #include "cpu/arm/fp32/tensor_computing_fp32.h" #endif @@ -36,7 +37,7 @@ EE scale_arm(TensorDesc inputDesc, if (outputDesc.df == DF_NCHWC8) { axis = outputDesc.nDims; } - EE ret = SUCCESS; + EE ret = NOT_SUPPORTED; switch (outputDesc.dt) { #ifdef _USE_FP32 case DT_F32: { @@ -52,8 +53,12 @@ EE scale_arm(TensorDesc inputDesc, break; } #endif + case DT_I32: { + ret = scale_int32((I32 *)input, axis, outputDesc.nDims, (I32 *)alpha, (I32 *)beta, on, + oc, elements_per_channel, ic, (I32 *)output); + break; + } default: - ret = NOT_SUPPORTED; break; } diff --git a/compute/tensor/src/cpu/arm/softmax.cpp b/compute/tensor/src/cpu/arm/softmax.cpp index 88ebb474..df50e0d6 100644 --- a/compute/tensor/src/cpu/arm/softmax.cpp +++ b/compute/tensor/src/cpu/arm/softmax.cpp @@ -22,9 +22,8 @@ EE softmax_arm( TensorDesc inputDesc, const void *input, SoftmaxParamSpec p, TensorDesc outputDesc, void *output) { - DataType idt = inputDesc.dt; - EE ret = SUCCESS; - switch (idt) { + EE ret = NOT_SUPPORTED; + switch (inputDesc.dt) { #ifdef _USE_FP32 case DT_F32: { ret = softmax_fp32(inputDesc, (const F32 *)input, p.axis, outputDesc, (F32 *)output); @@ -38,9 +37,30 @@ EE softmax_arm( } #endif default: - ret = NOT_SUPPORTED; break; } + return ret; +} +EE logsoftmax_arm( + TensorDesc inputDesc, const void *input, SoftmaxParamSpec p, TensorDesc outputDesc, void *output) +{ + EE ret = NOT_SUPPORTED; + switch (inputDesc.dt) { +#ifdef _USE_FP32 + case DT_F32: { + ret = logsoftmax_fp32(inputDesc, (const F32 *)input, p.axis, outputDesc, (F32 *)output); + break; + } +#endif +#ifdef _USE_FP16 + case DT_F16: { + ret = logsoftmax_fp16(inputDesc, (const F16 *)input, p.axis, outputDesc, (F16 *)output); + break; + } +#endif + default: + break; + } return ret; } diff --git a/compute/tensor/src/cpu/arm/tensor_computing_arm.h b/compute/tensor/src/cpu/arm/tensor_computing_arm.h index 9aa6ab5c..1329c7b7 100644 --- a/compute/tensor/src/cpu/arm/tensor_computing_arm.h +++ b/compute/tensor/src/cpu/arm/tensor_computing_arm.h @@ -179,8 +179,13 @@ EE rnncell_arm(TensorDesc xDesc, void *currentH, Arch arch); -EE layer_normalization_arm( - TensorDesc inputDesc, void *input, void *alpha, void *beta, TensorDesc outputDesc, void *output); +EE layer_normalization_arm(TensorDesc inputDesc, + void *input, + LayerNormParamSpec p, + void *alpha, + void *beta, + TensorDesc outputDesc, + void *output); EE pooling_arm(TensorDesc inputDesc, const void *input, @@ -208,6 +213,9 @@ EE scale_arm(TensorDesc inputDesc, EE softmax_arm( TensorDesc inputDesc, const void *input, SoftmaxParamSpec p, TensorDesc outputDesc, void *output); +EE logsoftmax_arm( + TensorDesc inputDesc, const void *input, SoftmaxParamSpec p, TensorDesc outputDesc, void *output); + EE check_arm(TensorDesc inputDescA, const void *inputA, TensorDesc inputDescB, diff --git a/compute/tensor/src/cpu/arm/transform_functions.h b/compute/tensor/src/cpu/arm/transform_functions.h index 8b191890..3fc9b27a 100644 --- a/compute/tensor/src/cpu/arm/transform_functions.h +++ b/compute/tensor/src/cpu/arm/transform_functions.h @@ -163,12 +163,12 @@ template inline T *convolution_input_padding_per_channel( U32 n, U32 ic, U32 it, U32 ih, U32 iw, const ConvolutionParamSpec &p, T *src, T *dst) { - U32 it_pad = it + p.padding_before + p.padding_after; - U32 ih_pad = ih + p.padding_top + p.padding_bottom; - U32 iw_pad = iw + p.padding_left + p.padding_right; + U32 it_pad = it + p.pad_before + p.pad_after; + U32 
ih_pad = ih + p.pad_top + p.pad_bottom; + U32 iw_pad = iw + p.pad_left + p.pad_right; T *inArray_pad; - if (p.padding_before == 0 && p.padding_after == 0 && p.padding_top == 0 && - p.padding_bottom == 0 && p.padding_left == 0 && p.padding_right == 0) { + if (p.pad_before == 0 && p.pad_after == 0 && p.pad_top == 0 && p.pad_bottom == 0 && + p.pad_left == 0 && p.pad_right == 0) { T *inArray_mov = src + n * ic * it * ih * iw * CAlignSize; inArray_pad = inArray_mov; } else { @@ -179,25 +179,25 @@ inline T *convolution_input_padding_per_channel( for (U32 c = 0; c < ic; c++) { T *inArray_mov = src + (n * ic + c) * it * ih * iw * CAlignSize; T *inArray_pad_mov = inArray_pad + c * it_pad * ih_pad * iw_pad * CAlignSize; - memset(inArray_pad_mov, 0, p.padding_before * ih_pad * iw_pad * CAlignSize * sizeof(T)); - inArray_pad_mov += p.padding_before * ih_pad * iw_pad * CAlignSize; - for (U32 t = p.padding_before; t < it_pad - p.padding_after; t++) { - memset(inArray_pad_mov, 0, p.padding_top * iw_pad * CAlignSize * sizeof(T)); - inArray_pad_mov += p.padding_top * iw_pad * CAlignSize; - for (U32 h = p.padding_top; h < ih_pad - p.padding_bottom; h++) { - memset(inArray_pad_mov, 0, p.padding_left * CAlignSize * sizeof(T)); - inArray_pad_mov += p.padding_left * CAlignSize; - memcpy(inArray_pad_mov, inArray_mov, iw * CAlignSize * sizeof(T)); + UNI_MEMSET(inArray_pad_mov, 0, p.pad_before * ih_pad * iw_pad * CAlignSize * sizeof(T)); + inArray_pad_mov += p.pad_before * ih_pad * iw_pad * CAlignSize; + for (U32 t = p.pad_before; t < it_pad - p.pad_after; t++) { + UNI_MEMSET(inArray_pad_mov, 0, p.pad_top * iw_pad * CAlignSize * sizeof(T)); + inArray_pad_mov += p.pad_top * iw_pad * CAlignSize; + for (U32 h = p.pad_top; h < ih_pad - p.pad_bottom; h++) { + UNI_MEMSET(inArray_pad_mov, 0, p.pad_left * CAlignSize * sizeof(T)); + inArray_pad_mov += p.pad_left * CAlignSize; + UNI_MEMCPY(inArray_pad_mov, inArray_mov, iw * CAlignSize * sizeof(T)); inArray_pad_mov += iw * CAlignSize; inArray_mov += iw * CAlignSize; - memset(inArray_pad_mov, 0, p.padding_right * CAlignSize * sizeof(T)); - inArray_pad_mov += p.padding_right * CAlignSize; + UNI_MEMSET(inArray_pad_mov, 0, p.pad_right * CAlignSize * sizeof(T)); + inArray_pad_mov += p.pad_right * CAlignSize; } - memset(inArray_pad_mov, 0, p.padding_bottom * iw_pad * CAlignSize * sizeof(T)); - inArray_pad_mov += p.padding_bottom * iw_pad * CAlignSize; + UNI_MEMSET(inArray_pad_mov, 0, p.pad_bottom * iw_pad * CAlignSize * sizeof(T)); + inArray_pad_mov += p.pad_bottom * iw_pad * CAlignSize; } - memset(inArray_pad_mov, 0, p.padding_after * ih_pad * iw_pad * CAlignSize * sizeof(T)); - inArray_pad_mov += p.padding_after * ih_pad * iw_pad * CAlignSize; + UNI_MEMSET(inArray_pad_mov, 0, p.pad_after * ih_pad * iw_pad * CAlignSize * sizeof(T)); + inArray_pad_mov += p.pad_after * ih_pad * iw_pad * CAlignSize; } } return inArray_pad; diff --git a/compute/tensor/src/cpu/cast.cpp b/compute/tensor/src/cpu/cast.cpp new file mode 100644 index 00000000..5ff4bcab --- /dev/null +++ b/compute/tensor/src/cpu/cast.cpp @@ -0,0 +1,100 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/tensor_computing_cpu.h" + +template +static void cast_kernel(U32 len, TI *input, TO *output) +{ + for (U32 i = 0; i < len; ++i) { + output[i] = (TO)(input[i]); + } +} + +template +static EE cast_kernel(U32 len, DataType odt, T *input, void *output) +{ + EE ret = SUCCESS; + switch (odt) { + case DT_I32: { + cast_kernel(len, input, (I32 *)output); + break; + } + case DT_U32: { + cast_kernel(len, input, (U32 *)output); + break; + } + case DT_F32: { + cast_kernel(len, input, (F32 *)output); + break; + } +#ifdef _USE_FP16 + case DT_F16: { + cast_kernel(len, input, (F16 *)output); + break; + } +#endif + case DT_U8: { + cast_kernel(len, input, (U8 *)output); + break; + } + case DT_I8: { + cast_kernel(len, input, (INT8 *)output); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE cast_cpu(TensorDesc inputDesc, void *input, TensorDesc outputDesc, void *output) +{ + DataType idt = inputDesc.dt; + DataType odt = outputDesc.dt; + U32 len = tensorNumElements(inputDesc); + EE ret; + switch (idt) { + case DT_F32: { + ret = cast_kernel(len, odt, (F32 *)input, output); + break; + } +#ifdef _USE_FP16 + case DT_F16: { + ret = cast_kernel(len, odt, (F16 *)input, output); + break; + } +#endif + case DT_U32: { + ret = cast_kernel(len, odt, (U32 *)input, output); + break; + } + case DT_I32: { + ret = cast_kernel(len, odt, (I32 *)input, output); + break; + } + case DT_U8: { + ret = cast_kernel(len, odt, (U8 *)input, output); + break; + } + case DT_I8: { + ret = cast_kernel(len, odt, (INT8 *)input, output); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/cpu/check.cpp b/compute/tensor/src/cpu/check.cpp new file mode 100644 index 00000000..434a85a3 --- /dev/null +++ b/compute/tensor/src/cpu/check.cpp @@ -0,0 +1,135 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/tensor_computing_cpu.h" + +template +static inline EE check_kernel( + TensorDesc aDesc, TA *a, TensorDesc bDesc, TB *b, CheckParamSpec p, TensorDesc outDesc, U8 *out) +{ + int aLen = tensorNumElements(aDesc); + int bLen = tensorNumElements(bDesc); + int len = tensorNumElements(outDesc); + EE ret = SUCCESS; + for (int i = 0; i < len; i++) { + TA va = a[i % aLen]; + TB vb = b[i % bLen]; + switch (p.mode) { + case CHECK_GREATER: { + out[i] = (va > (TA)vb) ? 1 : 0; + break; + } + case CHECK_GREATER_EQUAL: { + out[i] = (va >= (TA)vb) ? 1 : 0; + break; + } + case CHECK_EQUAL: { + out[i] = (va == (TA)vb) ? 1 : 0; + break; + } + case CHECK_NOT_EQUAL: { + out[i] = (va != (TA)vb) ? 1 : 0; + break; + } + case CHECK_LESS: { + out[i] = (va < (TA)vb) ? 1 : 0; + break; + } + case CHECK_LESS_EQUAL: { + out[i] = (va <= (TA)vb) ? 1 : 0; + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + } + return ret; +} + +template +EE check_wrapper(TensorDesc inputDescA, + TA *inputA, + TensorDesc inputDescB, + void *inputB, + CheckParamSpec p, + TensorDesc outputDesc, + U8 *output) +{ + EE ret = SUCCESS; + switch (inputDescB.dt) { +#ifdef _USE_FP32 + case DT_F32: { + ret = check_kernel( + inputDescA, inputA, inputDescB, (F32 *)inputB, p, outputDesc, output); + break; + } +#endif +#ifdef _USE_FP16 + case DT_F16: + ret = check_kernel( + inputDescA, inputA, inputDescB, (F16 *)inputB, p, outputDesc, output); + break; +#endif + case DT_U32: { + ret = check_kernel( + inputDescA, inputA, inputDescB, (U32 *)inputB, p, outputDesc, output); + break; + } + case DT_I32: { + ret = check_kernel( + inputDescA, inputA, inputDescB, (I32 *)inputB, p, outputDesc, output); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE check_cpu(TensorDesc inputADesc, + void *inputA, + TensorDesc inputBDesc, + void *inputB, + CheckParamSpec p, + TensorDesc outputDesc, + void *output) +{ + EE ret = NOT_SUPPORTED; + switch (inputADesc.dt) { + case DT_U32: + ret = check_wrapper( + inputADesc, (U32 *)inputA, inputBDesc, inputB, p, outputDesc, (U8 *)output); + break; + case DT_I32: + ret = check_wrapper( + inputADesc, (I32 *)inputA, inputBDesc, inputB, p, outputDesc, (U8 *)output); + break; +#ifdef _USE_FP32 + case DT_F32: + ret = check_wrapper( + inputADesc, (F32 *)inputA, inputBDesc, inputB, p, outputDesc, (U8 *)output); + break; +#endif +#ifdef _USE_FP16 + case DT_F16: + ret = check_wrapper( + inputADesc, (F16 *)inputA, inputBDesc, inputB, p, outputDesc, (U8 *)output); + break; +#endif + default: + break; + } + return ret; +} diff --git a/compute/tensor/src/cpu/concat.cpp b/compute/tensor/src/cpu/concat.cpp index 5927cdf3..b21f6461 100644 --- a/compute/tensor/src/cpu/concat.cpp +++ b/compute/tensor/src/cpu/concat.cpp @@ -41,7 +41,7 @@ inline static void concat_v1(const std::vector &inputDesc, U32 blockSize = inputDesc[j].dims[axis] * tileSize; if (!jumpMemcpy[j]) { U8 *srcPtr = ((U8 *)input[j]) + i * blockSize; - memcpy(dstPtr, srcPtr, blockSize); + UNI_MEMCPY(dstPtr, srcPtr, blockSize); } dstPtr += blockSize; } @@ -61,21 
+61,23 @@ inline static void concat_v2(const std::vector &inputDesc, U8 *dstPtr = (U8 *)output; for (U32 i = 0; i < loops; i++) { for (U32 j = 0; j < num; j++) { - U32 blockSize = inputDesc[j].dims[axis] * tileSize; + I32 blockSize = inputDesc[j].dims[axis] * tileSize; if (!jumpMemcpy[j]) { U8 *srcPtr = ((U8 *)input[j]) + i * blockSize; #ifdef _USE_OPENMP - U32 bblockNum = OMP_NUM_THREADS; - U32 bblockSize = (blockSize + bblockNum - 1) / bblockNum; + I32 bblockNum = OMP_NUM_THREADS; + I32 bblockSize = (blockSize + bblockNum - 1) / bblockNum; + bblockSize = UNI_MIN(32, bblockSize); + bblockNum = (blockSize + bblockSize - 1) / bblockSize; -#pragma omp parallel for num_threads(OMP_NUM_THREADS) - for (U32 k = 0; k < bblockNum; ++k) { - U32 copyDst = k * bblockSize; - memcpy(dstPtr + copyDst, srcPtr + copyDst, +#pragma omp parallel for num_threads(OMP_NUM_THREADS) if (bblockNum >= OMP_NUM_THREADS) + for (I32 k = 0; k < bblockNum; ++k) { + I32 copyDst = k * bblockSize; + UNI_MEMCPY(dstPtr + copyDst, srcPtr + copyDst, UNI_MIN(bblockSize, blockSize - copyDst)); } #else - memcpy(dstPtr, srcPtr, blockSize); + UNI_MEMCPY(dstPtr, srcPtr, blockSize); #endif } dstPtr += blockSize; @@ -138,7 +140,9 @@ static EE concat(std::vector inputDesc, U8 *tmpPtr = (U8 *)tmp; U32 outputOff = 0; for (U32 j = 0; j < num; j++) { - if ((4 != inputDesc[j].nDims) || (1 != inputDesc[j].dims[1]) || (1 != inputDesc[j].dims[0])) { + if (((4 == inputDesc[j].nDims) && + ((1 != inputDesc[j].dims[1]) || (1 != inputDesc[j].dims[0]))) || + ((3 == inputDesc[j].nDims) && (1 != inputDesc[j].dims[0]))) { if (isC8 && (DF_NCHWC8 != inputDesc[j].df)) { TensorDesc tmpDesc = inputDesc[j]; tmpDesc.df = DF_NCHWC8; diff --git a/compute/tensor/src/cpu/cpu_functions_template.h b/compute/tensor/src/cpu/cpu_functions_template.h index b30c4471..67f3c7c9 100644 --- a/compute/tensor/src/cpu/cpu_functions_template.h +++ b/compute/tensor/src/cpu/cpu_functions_template.h @@ -63,9 +63,9 @@ inline void array_power_template(T *input, T *output, I32 len, F32 power) } template -EE activation_template(ActivationParamSpec activationDesc, F32 input, T *output) +inline EE activation_template(const ActivationParamSpec &activationDesc, const F32 &input, T *output) { - F32 value, result = 0; + F32 result = 0; EE ret = SUCCESS; switch (activationDesc.mode) { case ACTIVATION_NULL: { @@ -73,86 +73,52 @@ EE activation_template(ActivationParamSpec activationDesc, F32 input, T *output) break; } case ACTIVATION_RELU: { - value = input; - F32 tmp = activationDesc.value[0] * value; - if (value < tmp) { - value = tmp; - } - result = value; + result = UNI_MAX(activationDesc.value[0] * input, input); break; } case ACTIVATION_RELU6: { - value = input; - if (value < 0) { - value = 0; - } - if (value > 6) { - value = 6; - } - result = value; + result = UNI_MIN(UNI_MAX(input, 0), 6); break; } case ACTIVATION_H_SIGMOID: { - value = input + 3; - if (value < 0) { - value = 0; - } - if (value > 6) { - value = 6; - } - result = value / 6; + result = UNI_MIN(UNI_MAX(input + 3, 0), 6) / 6; break; } case ACTIVATION_H_SWISH: { - value = input + 3; - if (value < 0) { - value = 0; - } - if (value > 6) { - value = 6; - } - result = input * (value / 6); + result = UNI_MIN(UNI_MAX(input + 3, 0), 6) * input / 6; break; } case ACTIVATION_H_SWISH_NODIV: { - value = input + 3; - if (value < 0) { - value = 0; - } - if (value > 6) { - value = 6; - } - result = input * value; + result = UNI_MIN(UNI_MAX(input + 3, 0), 6) * input; break; } case ACTIVATION_GELU: { - value = input; - value = erf(value / 
sqrt(2)); + F32 value = erf(input / sqrt(2)); value = 0.5 * (1.0 + value); - value = input * value; - result = value; + result = input * value; break; } case ACTIVATION_TANH: { - value = 1.0 - 2.0 / (exp(2.0 * input) + 1.0); - result = value; + result = 1.0 - 2.0 / (exp(2.0 * input) + 1.0); break; } case ACTIVATION_SIGMOID: { - value = 1.0 / (1.0 + exp(-1.0 * input)); - result = value; + result = 1.0 / (1.0 + exp(-1.0 * input)); + break; + } + case ACTIVATION_SWISH: { + result = input / (1.0 + exp(-1.0 * input)); break; } case ACTIVATION_MISH: { - value = input; + F32 value = input; F32 mish_threshold = 20; if (value < -mish_threshold) { value = exp(value); } else if (!(value > mish_threshold || value < -mish_threshold)) { value = log(exp(value) + 1.0); } - value = input * tanh(value); - result = value; + result = input * tanh(value); break; } case ACTIVATION_SOFTPLUS: { @@ -183,6 +149,22 @@ EE activation_template(ActivationParamSpec activationDesc, F32 input, T *output) result = -input; break; } + case ACTIVATION_ROUND: { + result = round(input); + break; + } + case ACTIVATION_CEIL: { + result = ceil(input); + break; + } + case ACTIVATION_FLOOR: { + result = floor(input); + break; + } + case ACTIVATION_RECIPROCAL: { + result = 1 / input; + break; + } default: ret = NOT_SUPPORTED; break; diff --git a/compute/tensor/src/cpu/deconvolution.cpp b/compute/tensor/src/cpu/deconvolution.cpp index cfee9b10..60e33f1a 100644 --- a/compute/tensor/src/cpu/deconvolution.cpp +++ b/compute/tensor/src/cpu/deconvolution.cpp @@ -50,7 +50,7 @@ EE deconvolution_infer_forward_algorithm_cpu(TensorDesc inputDesc, } #ifdef _USE_X86 - if (IS_X86(arch) && idf == DF_NCHWC8 && (fc * 2 < ic || fc < 128)) { + if (IS_X86(arch) && idf == DF_NCHWC8) { *algorithm = CONVOLUTION_ALGORITHM_POINTWISE; return SUCCESS; } @@ -144,23 +144,21 @@ EE deconvolution_infer_forward_tmp_bytes_cpu(TensorDesc inputDesc, if (algorithm == CONVOLUTION_ALGORITHM_IM2COL_GEMM) { TensorDesc matrixADesc = tensor2df(idt, DF_NKN8, ic, in * ih * iw); TensorDesc matrixBDesc = tensor2df(idt, DF_NORMAL, ic, oc * fh * fw); - CHECK_STATUS(matrix_matrix_multiply_tmp_bytes(matrixADesc, matrixBDesc, bytes, X86_AVX2)); + CHECK_STATUS(matrix_matrix_multiply_tmp_bytes(matrixADesc, matrixBDesc, bytes, arch)); *bytes += in * ih * iw * oc * fh * fw * bytesOf(idt); -#ifdef _USE_NEON - if (IS_ARM(arch) && idf == DF_NCHWC8) { + if (!IS_X86(arch) || idf != DF_NCHWC8 || in > 1) { *bytes += in * ih * iw * ic * bytesOf(idt); } *bytes += 32; -#endif return SUCCESS; } U32 strideH = convParamSpec.stride_h; U32 strideW = convParamSpec.stride_w; - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; U32 tPadding = fh - 1 - paddingT; U32 bPadding = fh - 1 - paddingB; @@ -197,37 +195,37 @@ EE deconvolution_gemm(TensorDesc inputDesc, U32 fh = convParamSpec.kernel_h; U32 fw = convParamSpec.kernel_w; - TensorDesc matrixADesc = tensor2df(idt, DF_TRANSPOSE, ic, in * ih * iw); - if (idf == DF_NCHWC8) { - if (IS_X86(arch)) { - matrixADesc = tensor2df(idt, DF_NKN8, ic, in * ih * iw); - } else { - TensorDesc tmpDesc = tensor4df(odt, DF_NCHW, in, ic, ih, iw); - U8 *tmpInput = (U8 *)tmp; - transformToNCHW(inputDesc, input, tmpDesc, tmpInput); - input = tmpInput; - tmp = (void *)(tmpInput + in * ic * iw 
* ih * bytesOf(idt)); - } + TensorDesc matrixADesc = tensor2df(idt, DF_NORMAL, in * ih * iw, ic); + if (IS_X86(arch) && idf == DF_NCHWC8 && in == 1) { + matrixADesc = tensor2df(idt, DF_NKN8, ic, in * ih * iw); + } else { + TensorDesc tmpDesc = tensor4df(odt, DF_NHWC, in, ic, ih, iw); + U8 *tmpInput = (U8 *)tmp; + transformFormat(inputDesc, input, tmpDesc, tmpInput); + input = tmpInput; + tmp = (void *)(tmpInput + in * ic * iw * ih * bytesOf(idt)); } TensorDesc matrixCDesc = tensor2df(odt, DF_NORMAL, in * ih * iw, fw * fh * oc); U8 *tmpOutput = (U8 *)tmp; - tmpOutput += in * ih * iw * ic * bytesOf(idt); + tmp = (void *)(tmpOutput + in * ih * iw * fw * fh * oc * bytesOf(idt)); - memset(tmpOutput, 0, in * ih * iw * fw * fh * oc * bytesOf(idt)); + UNI_MEMSET(tmpOutput, 0, in * ih * iw * fw * fh * oc * bytesOf(idt)); CHECK_STATUS(matrix_matrix_multiply(matrixADesc, input, filterDesc, filter, tmpBytes, tmp, matrixCDesc, tmpOutput, nullptr, arch)); U8 *tmpOutputPtr = (U8 *)output; U32 biasTileSize = bytesOf(biasDesc.dt) * 8; - U8 *biasPtr = (U8 *)bias; - for (U32 c = 0; c < oc / 8; c++, biasPtr += biasTileSize) { - for (U32 n = 0; n < oh * ow; n++) { - memcpy(tmpOutputPtr, biasPtr, biasTileSize); - tmpOutputPtr += biasTileSize; + for (U32 n = 0; n < on; ++n) { + U8 *biasPtr = (U8 *)bias; + for (U32 c = 0; c < oc / 8; c++, biasPtr += biasTileSize) { + for (U32 hw = 0; hw < oh * ow; hw++) { + UNI_MEMCPY(tmpOutputPtr, biasPtr, biasTileSize); + tmpOutputPtr += biasTileSize; + } } } - EE ret = NOT_SUPPORTED; + EE ret = SUCCESS; if (IS_ARM(arch)) { #ifdef _USE_NEON ret = @@ -299,18 +297,18 @@ EE deconvolution_cpu(TensorDesc inputDesc, U32 strideH = convParamSpec.stride_h; U32 strideW = convParamSpec.stride_w; - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; ConvolutionParamSpec transposedCD = convParamSpec; transposedCD.stride_h = 1; transposedCD.stride_w = 1; - transposedCD.padding_top = 0; - transposedCD.padding_bottom = 0; - transposedCD.padding_left = 0; - transposedCD.padding_right = 0; + transposedCD.pad_top = 0; + transposedCD.pad_bottom = 0; + transposedCD.pad_left = 0; + transposedCD.pad_right = 0; transposedCD.dilatedRate_h = 1; transposedCD.dilatedRate_w = 1; @@ -323,69 +321,73 @@ EE deconvolution_cpu(TensorDesc inputDesc, U32 stuffW = strideW - 1; U32 ihPadded = ih + (ih - 1) * stuffH + tPadding + bPadding; U32 iwPadded = iw + (iw - 1) * stuffW + lPadding + rPadding; - TensorDesc inPaddedDesc = tensor4df(idt, idf, in, ic, ihPadded, iwPadded); + TensorDesc inPaddedDesc = tensor4df(idt, idf, 1, ic, ihPadded, iwPadded); + TensorDesc singleOutputDesc = tensor4df(idt, idf, 1, oc, oh, ow); - U8 *inPad = (U8 *)tmp; - U8 *inPadMov = inPad; - U8 *inputMov = (U8 *)input; U32 memUnit = 8 * bytesOf(idt); + U32 ic8 = ic / 8; + EE ret = NOT_SUPPORTED; + TensorDesc blankTensorDesc; + ActivationParamSpec blankActivationParamSpec; - ic /= 8; + for (U32 n = 0; n < in; ++n) { + U8 *inputMov = (U8 *)input + n * ih * iw * ic * bytesOf(idt); + U8 *outputMov = (U8 *)output + n * oh * ow * oc * bytesOf(odt); + U8 *inPad = (U8 *)tmp; + U8 *inPadMov = inPad; - for (U32 c = 0; c < ic; c++) { - for (U32 h = 0; h < tPadding; h++) { - memset(inPadMov, 0, iwPadded * memUnit); - inPadMov += iwPadded * memUnit; - } - for (U32 h = 
0; h < ih - 1; h++) { - memset(inPadMov, 0, lPadding * memUnit); + for (U32 c = 0; c < ic8; c++) { + for (U32 h = 0; h < tPadding; h++) { + UNI_MEMSET(inPadMov, 0, iwPadded * memUnit); + inPadMov += iwPadded * memUnit; + } + for (U32 h = 0; h < ih - 1; h++) { + UNI_MEMSET(inPadMov, 0, lPadding * memUnit); + inPadMov += lPadding * memUnit; + for (U32 w = 0; w < iw - 1; w++) { + UNI_MEMCPY(inPadMov, inputMov, memUnit); + inPadMov += memUnit; + inputMov += memUnit; + UNI_MEMSET(inPadMov, 0, stuffW * memUnit); + inPadMov += stuffW * memUnit; + } + UNI_MEMCPY(inPadMov, inputMov, memUnit); + inPadMov += memUnit; + inputMov += memUnit; + UNI_MEMSET(inPadMov, 0, rPadding * memUnit); + inPadMov += rPadding * memUnit; + + // stuffH + UNI_MEMSET(inPadMov, 0, iwPadded * stuffH * memUnit); + inPadMov += iwPadded * stuffH * memUnit; + } + UNI_MEMSET(inPadMov, 0, lPadding * memUnit); inPadMov += lPadding * memUnit; for (U32 w = 0; w < iw - 1; w++) { - memcpy(inPadMov, inputMov, memUnit); + UNI_MEMCPY(inPadMov, inputMov, memUnit); inPadMov += memUnit; inputMov += memUnit; - memset(inPadMov, 0, stuffW * memUnit); + UNI_MEMSET(inPadMov, 0, stuffW * memUnit); inPadMov += stuffW * memUnit; } - memcpy(inPadMov, inputMov, memUnit); + UNI_MEMCPY(inPadMov, inputMov, memUnit); inPadMov += memUnit; inputMov += memUnit; - memset(inPadMov, 0, rPadding * memUnit); + UNI_MEMSET(inPadMov, 0, rPadding * memUnit); inPadMov += rPadding * memUnit; - // stuffH - memset(inPadMov, 0, iwPadded * stuffH * memUnit); - inPadMov += iwPadded * stuffH * memUnit; - } - memset(inPadMov, 0, lPadding * memUnit); - inPadMov += lPadding * memUnit; - for (U32 w = 0; w < iw - 1; w++) { - memcpy(inPadMov, inputMov, memUnit); - inPadMov += memUnit; - inputMov += memUnit; - memset(inPadMov, 0, stuffW * memUnit); - inPadMov += stuffW * memUnit; - } - memcpy(inPadMov, inputMov, memUnit); - inPadMov += memUnit; - inputMov += memUnit; - memset(inPadMov, 0, rPadding * memUnit); - inPadMov += rPadding * memUnit; - - for (U32 h = ihPadded - bPadding; h < ihPadded; h++) { - memset(inPadMov, 0, iwPadded * memUnit); - inPadMov += iwPadded * memUnit; + for (U32 h = ihPadded - bPadding; h < ihPadded; h++) { + UNI_MEMSET(inPadMov, 0, iwPadded * memUnit); + inPadMov += iwPadded * memUnit; + } } - } - EE ret = NOT_SUPPORTED; - TensorDesc blankTensorDesc; - ActivationParamSpec blankActivationParamSpec; - ret = depthwise_pointwise_convolution_cpu(inPaddedDesc, inPad, filterDesc, filter, - blankTensorDesc, nullptr, transposedCD, DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT, - biasDesc, bias, blankTensorDesc, nullptr, tmpBytes - tensorNumBytes(inPaddedDesc), - inPad + tensorNumBytes(inPaddedDesc), outputDesc, output, activationDesc, - blankActivationParamSpec, arch); + ret = depthwise_pointwise_convolution_cpu(inPaddedDesc, inPad, filterDesc, filter, + blankTensorDesc, nullptr, transposedCD, + DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT, biasDesc, bias, blankTensorDesc, + nullptr, tmpBytes - tensorNumBytes(inPaddedDesc), inPad + tensorNumBytes(inPaddedDesc), + singleOutputDesc, outputMov, activationDesc, blankActivationParamSpec, arch); + } return ret; } diff --git a/compute/tensor/src/cpu/depth2space.cpp b/compute/tensor/src/cpu/depth2space.cpp new file mode 100644 index 00000000..7ca67dfd --- /dev/null +++ b/compute/tensor/src/cpu/depth2space.cpp @@ -0,0 +1,88 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/tensor_computing_cpu.h" + +template +static inline EE depth2space_kernel( + TensorDesc inputDesc, T *input, Depth2SpaceParamSpec p, TensorDesc outputDesc, T *output) +{ + DataType idt, odt; + DataFormat idf, odf; + U32 in, ic, ih, iw; + U32 on, oc, oh, ow; + int bh = p.block_size; + int bw = p.block_size; + if (tensorIs4d(inputDesc)) { + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + } else if (tensorIs3d(inputDesc)) { + CHECK_STATUS(tensor3dGet(inputDesc, &idt, &idf, &in, &ic, &ih)); + CHECK_STATUS(tensor3dGet(outputDesc, &odt, &odf, &on, &oc, &oh)); + iw = ow = 1; + bw = 1; + } else { + return NOT_SUPPORTED; + } + + int cx = 1; + if (idf == DF_NCHWC8) { + cx = 8; + } + if (idf == DF_NCHWC16) { + cx = 16; + } + U32 icx = ic / cx; + for (U32 n = 0, o_i = 0; n < in; n++) { + for (U32 c = 0; c < oc; c++) { + for (U32 h = 0; h < ih; h++) { + for (int i = 0; i < bh; i++) { + for (U32 w = 0; w < iw; w++) { + for (int j = 0; j < bw; j++, o_i++) { + int i_c = (c * bh + i) * bw + j; + int c1 = i_c / cx; + int c2 = i_c % cx; + int i_i = (((n * icx + c1) * ih + h) * iw + w) * cx + c2; + output[o_i] = input[i_i]; + } + } + } + } + } + } + return SUCCESS; +} + +EE depth2space_cpu( + TensorDesc inputDesc, void *input, Depth2SpaceParamSpec p, TensorDesc outputDesc, void *output) +{ + if (nullptr == input || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + EE ret = NOT_SUPPORTED; + switch (inputDesc.dt) { +#ifdef _USE_FP32 + case DT_F32: + ret = depth2space_kernel(inputDesc, (F32 *)input, p, outputDesc, (F32 *)output); + break; +#endif +#ifdef _USE_FP16 + case DT_F16: + ret = depth2space_kernel(inputDesc, (F16 *)input, p, outputDesc, (F16 *)output); + break; +#endif + default: + break; + } + return ret; +} diff --git a/compute/tensor/src/cpu/depthwise_pointwise_convolution.cpp b/compute/tensor/src/cpu/depthwise_pointwise_convolution.cpp index b7d70f00..caf10831 100644 --- a/compute/tensor/src/cpu/depthwise_pointwise_convolution.cpp +++ b/compute/tensor/src/cpu/depthwise_pointwise_convolution.cpp @@ -53,7 +53,7 @@ EE depthwise_pointwise_convolution_cpu(TensorDesc inputDesc, #ifdef _USE_X86 } else if (IS_X86(arch)) { ret = depthwise_pointwise_convolution_x86(inputDesc, input, nullptr, dwFilterDesc, dwFilter, - pwFilterDesc, pwFilter, convParamSpec, algorithm, dwBiasDesc, dwBias, pwBiasDesc, + pwFilterDesc, pwFilter, convParamSpec, algorithm, 
nullptr, dwBiasDesc, dwBias, pwBiasDesc, pwBias, tmpBytes, tmp, outputDesc, output, depthwiseActivationParamSpec, pointwiseActivationParamSpec, arch); #endif diff --git a/compute/tensor/src/cpu/detectionoutput.cpp b/compute/tensor/src/cpu/detectionoutput.cpp index 9695c638..aa2eebc1 100644 --- a/compute/tensor/src/cpu/detectionoutput.cpp +++ b/compute/tensor/src/cpu/detectionoutput.cpp @@ -11,86 +11,8 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -#include "error.h" #include "cpu/tensor_computing_cpu.h" - -inline EE qsort_descent(std::vector &boxes, std::vector &scores, int left, int right) -{ - if (boxes.empty() || scores.empty()) { - return NOT_SUPPORTED; - } - - int i = left; - int j = right; - F32 temp = scores[(left + right) / 2]; - - while (i <= j) { - while (scores[i] > temp) { - i++; - } - while (scores[j] < temp) { - j--; - } - if (i <= j) { - std::swap(boxes[i], boxes[j]); - std::swap(scores[i], scores[j]); - i++; - j--; - } - } - - if (left < j) { - qsort_descent(boxes, scores, left, j); - } - if (i < right) { - qsort_descent(boxes, scores, i, right); - } - - return SUCCESS; -} - -inline F32 intersectionarea(BoxRect a, BoxRect b) -{ - if (a.xmin > b.xmax || a.xmax < b.xmin || a.ymin > b.ymax || a.ymax < b.ymin) { - return 0.f; - } - F32 inter_width = std::min(a.xmax, b.xmax) - std::max(a.xmin, b.xmin); - F32 inter_height = std::min(a.ymax, b.ymax) - std::max(a.ymin, b.ymin); - - return inter_width * inter_height; -} - -inline EE nms_pickedboxes(std::vector boxes, std::vector &picked, F32 nms_threshold) -{ - I64 n = boxes.size(); - - std::vector areas(n); - for (I64 i = 0; i < n; i++) { - BoxRect box = boxes[i]; - - F32 width = box.xmax - box.xmin; - F32 height = box.ymax - box.ymin; - - areas[i] = width * height; - } - for (I64 i = 0; i < n; i++) { - BoxRect a = boxes[i]; - int keep = 1; - for (int j = 0; j < (int)picked.size(); j++) { - BoxRect b = boxes[picked[j]]; - F32 inter_area = intersectionarea(a, b); - F32 union_area = areas[i] + areas[picked[j]] - inter_area; - - if (inter_area / union_area > nms_threshold) { - keep = 0; - } - } - if (keep) { - picked.push_back(i); - } - } - return SUCCESS; -} +#include "cpu/non_max_suppression.h" template EE detectionoutput_kernel(std::vector input, @@ -138,71 +60,54 @@ EE detectionoutput_kernel(std::vector input, boxes[i].assign(box.begin(), box.end()); } - std::vector> allclass_boxrects; - std::vector> allclass_boxscores; - allclass_boxrects.resize(numclass); - allclass_boxscores.resize(numclass); - + std::vector> allclass_boxrects(numclass); for (U32 i = 1; i < numclass; i++) { std::vector class_boxrects; - std::vector class_boxscores; for (U32 j = 0; j < num_total_priorbox; j++) { F32 score = confidence[j * numclass + i]; if (score > confidence_threshold) { std::vector inbox; inbox.assign(boxes[j].begin(), boxes[j].end()); - BoxRect b = {inbox[0], inbox[1], inbox[2], inbox[3], i}; + BoxRect b = {inbox[0], inbox[1], inbox[2], inbox[3], i, score, j}; class_boxrects.push_back(b); - class_boxscores.push_back(score); } } // sort the boxes with scores - qsort_descent( - class_boxrects, class_boxscores, 0, static_cast(class_boxscores.size() - 1)); + std::stable_sort(class_boxrects.begin(), class_boxrects.end(), + [&](const BoxRect &a, const BoxRect &b) { return (a.score > b.score); }); - if (nms_top_k < (U32)class_boxrects.size()) { + if (nms_top_k < 
class_boxrects.size()) { class_boxrects.resize(nms_top_k); - class_boxscores.resize(nms_top_k); } // apply nms - std::vector picked; - nms_pickedboxes(class_boxrects, picked, nms_threshold); - - for (I64 j = 0; j < (I64)picked.size(); j++) { + std::vector picked = nms_pickedboxes(class_boxrects, nms_threshold); + for (U32 j = 0; j < picked.size(); j++) { I64 picked_box = picked[j]; allclass_boxrects[i].push_back(class_boxrects[picked_box]); - allclass_boxscores[i].push_back(class_boxscores[picked_box]); } } std::vector boxrects; - std::vector boxscores; - for (U32 i = 1; i < numclass; i++) { boxrects.insert(boxrects.end(), allclass_boxrects[i].begin(), allclass_boxrects[i].end()); - boxscores.insert( - boxscores.end(), allclass_boxscores[i].begin(), allclass_boxscores[i].end()); } - qsort_descent(boxrects, boxscores, 0, static_cast(boxscores.size() - 1)); - + std::stable_sort(boxrects.begin(), boxrects.end(), + [&](const BoxRect &a, const BoxRect &b) { return (a.score > b.score); }); if (keep_top_k < (U32)boxrects.size()) { boxrects.resize(keep_top_k); - boxscores.resize(keep_top_k); } - U32 num_detected = static_cast(boxrects.size()); + U32 num_detected = boxrects.size(); // the first box contains the number of availble boxes in the first element. output[0] = num_detected; output[1] = output[2] = output[3] = output[4] = output[5] = 0; for (U32 i = 0; i < num_detected; i++) { BoxRect b = boxrects[i]; - F32 score = boxscores[i]; - output[(i + 1) * 6] = b.label; - output[(i + 1) * 6 + 1] = score; + output[(i + 1) * 6 + 1] = b.score; output[(i + 1) * 6 + 2] = b.xmin; output[(i + 1) * 6 + 3] = b.ymin; output[(i + 1) * 6 + 4] = b.xmax; diff --git a/compute/tensor/src/cpu/eltwise.cpp b/compute/tensor/src/cpu/eltwise.cpp index fd3bd34f..f369d0eb 100644 --- a/compute/tensor/src/cpu/eltwise.cpp +++ b/compute/tensor/src/cpu/eltwise.cpp @@ -34,24 +34,90 @@ static std::vector calculateRelativeLocalIndex_cpu(U32 *indexes, U32 *dims, return relativeIndexes; } -// [1, 10, 10] + [1, 10, 10] = [1, 10, 10] -// [1, 10, 1] + [1, 1, 10] = [1, 10, 10] -// [1, 20, 10] + [10] = [1. 
20, 10] + [1, 1, 10] = [1, 20, 10] -EE eltwise_cpu(std::vector inputDesc, - std::vector input_, - EltwiseParamSpec eltwiseDesc, - U32 tmpBytes, - void *tmp, - TensorDesc outputDesc, - void *output, - Arch arch) +static void get_dim_nonone_bound(TensorDesc desc, int *left, int *right) { - U32 num = inputDesc.size(); - if (num <= 1 || outputDesc.nDims < 1) { - return NOT_MATCH; + *left = -1; + for (U32 i = 0; i < desc.nDims; i++) { + if (desc.dims[i] == 1) { + *left = i; + } else { + break; + } + } + *right = desc.nDims; + for (I32 i = desc.nDims - 1; i >= 0; i--) { + if (desc.dims[i] == 1) { + *right = i; + } else { + break; + } } - std::vector input = input_; + *left = *left + 1; + *right = *right - 1; +} +static int scale_axis( + std::vector inputDesc, TensorDesc outputDesc, int *scaleId, TensorDesc *scaleDesc) +{ + if (inputDesc.size() != 2) { + return -1; + } + int al, ar, bl, br; + get_dim_nonone_bound(inputDesc[0], &al, &ar); + get_dim_nonone_bound(inputDesc[1], &bl, &br); + // use power operator + if (al > ar) { + return -2; + } + if (bl > br) { + return -3; + } + int cl = UNI_MIN(al, bl); + int cr = UNI_MAX(ar, br); + int alpha = -1; + if (cr - cl > ar - al) { + alpha = 0; + } + if (cr - cl > br - bl) { + alpha = 1; + } + if (alpha < 0) { + return -1; + } + int dl = UNI_MAX(al, bl); + int dr = UNI_MIN(ar, br); + for (int i = dl; i <= dr; i++) { + if (inputDesc[0].dims[i] != inputDesc[1].dims[i]) { + return -1; + } + } + int axis = cr - dr; + *scaleId = 1 - alpha; + *scaleDesc = inputDesc[*scaleId]; + scaleDesc->nDims = (dl - cl) + (cr - dr) + 1; + int j = 0; + for (int i = cl; i < dl; i++) { + scaleDesc->dims[j++] = inputDesc[*scaleId].dims[i]; + } + scaleDesc->dims[j] = 1; + for (int i = dl; i <= dr; i++) { + scaleDesc->dims[j] *= inputDesc[*scaleId].dims[i]; + } + for (int i = dr + 1; i <= cr; i++) { + scaleDesc->dims[++j] = inputDesc[*scaleId].dims[i]; + } + if (dr == cr) { + scaleDesc->dims[++j] = 1; + scaleDesc->nDims++; + axis++; + } + return axis; +} + +static void align_param( + std::vector &inputDesc, std::vector &input, void *tmp, TensorDesc &outputDesc) +{ + U32 num = inputDesc.size(); U8 *ptr = (U8 *)tmp; std::set nchw = {DF_NORMAL, DF_MTK, DF_MKT, DF_NCHW}; for (U32 i = 0; i < num; i++) { @@ -66,103 +132,176 @@ EE eltwise_cpu(std::vector inputDesc, inputDesc[i] = tensor4df(inputDesc[i].dt, DF_NHWC, inputDesc[i].dims[2], inputDesc[i].dims[0], inputDesc[i].dims[1], 1); } - CHECK_STATUS(transformFormat(inputDesc[i], input[i], outputDesc, ptr)); - inputDesc[i] = outputDesc; + TensorDesc tmpDesc = outputDesc; + if (tensorNumElements(inputDesc[i]) < tensorNumElements(outputDesc)) { + tmpDesc = inputDesc[i]; + tmpDesc.df = outputDesc.df; + } + CHECK_STATUS(transformFormat(inputDesc[i], input[i], tmpDesc, ptr)); + inputDesc[i] = tmpDesc; input[i] = ptr; - ptr += tensorNumBytes(outputDesc); + ptr += tensorNumBytes(tmpDesc); } } I32 oneCount = 0; - for (int i = 0; i < ((int)outputDesc.nDims) - 1; i++) { + for (int i = 0; i < (int)outputDesc.nDims - 1; i++) { if (outputDesc.dims[i] == 1) { oneCount++; } else { break; } } - TensorDesc newOutputDesc = outputDesc; + for (int i = 0; i < (int)outputDesc.nDims - oneCount; i++) { - newOutputDesc.dims[i] = outputDesc.dims[oneCount + i]; + outputDesc.dims[i] = outputDesc.dims[oneCount + i]; } - newOutputDesc.nDims = outputDesc.nDims - oneCount; + outputDesc.nDims = outputDesc.nDims - oneCount; - std::vector newInputDesc(num); for (U32 i = 0; i < num; i++) { - newInputDesc[i] = inputDesc[i]; + TensorDesc desc = inputDesc[i]; for (int j = 0; 
j < (int)inputDesc[i].nDims - oneCount; j++) { - newInputDesc[i].dims[j] = inputDesc[i].dims[oneCount + j]; + desc.dims[j] = inputDesc[i].dims[oneCount + j]; } - newInputDesc[i].nDims = inputDesc[i].nDims - oneCount; - for (U32 j = newInputDesc[i].nDims; j < newOutputDesc.nDims; j++) { - newInputDesc[i].dims[j] = 1; + desc.nDims = inputDesc[i].nDims - oneCount; + for (U32 j = desc.nDims; j < outputDesc.nDims; j++) { + desc.dims[j] = 1; } - newInputDesc[i].nDims = newOutputDesc.nDims; + desc.nDims = outputDesc.nDims; + inputDesc[i] = desc; } - U32 size = tensorNumElements(newOutputDesc); - int lastDimSize = newOutputDesc.dims[0]; +} + +static EE eltwise_kernel(std::vector inputDesc, + std::vector input, + EltwiseParamSpec p, + TensorDesc outputDesc, + void *output, + Arch arch) +{ + U32 num = inputDesc.size(); + int lastDimSize = outputDesc.dims[0]; std::vector lastDimSizes(num); bool sameDim = true; for (U32 i = 0; i < num; i++) { - lastDimSizes[i] = newInputDesc[i].dims[0]; + lastDimSizes[i] = inputDesc[i].dims[0]; if (lastDimSizes[i] != lastDimSize) { sameDim = false; - if (newInputDesc[0].df == DF_NCHWC8 || newInputDesc[0].df == DF_NCHWC16) { + if (inputDesc[0].df == DF_NCHWC8 || inputDesc[0].df == DF_NCHWC16) { UNI_ERROR_LOG("For NCHWC8 and NCHWC16, eltwise can only handle inputs with " "matching widths\n"); } } } - for (U32 i = 1; i < newOutputDesc.nDims; i++) { + for (U32 i = 1; i < outputDesc.nDims; i++) { for (U32 j = 0; j < num; j++) { - if (newInputDesc[j].dims[i] != newOutputDesc.dims[i]) { + if (inputDesc[j].dims[i] != outputDesc.dims[i]) { sameDim = false; break; } } if (sameDim) { - lastDimSize *= newOutputDesc.dims[i]; + lastDimSize *= outputDesc.dims[i]; for (U32 j = 0; j < num; j++) { - lastDimSizes[j] *= newInputDesc[j].dims[i]; + lastDimSizes[j] *= inputDesc[j].dims[i]; } } else { break; } } - std::vector newInput(num); EE ret = NOT_SUPPORTED; - for (U32 i = 0; i < size; i += lastDimSize) { - std::vector index = calculateLocalIndex(i, newOutputDesc.dims, newOutputDesc.nDims); + if (sameDim) { // if merged to the next loop, it will be slower when using openmp. 
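The restructured eltwise path above first folds every trailing dimension on which all inputs agree into one contiguous run (lastDimSize), so the element-wise kernel is called on long vectors; only when a dimension mismatches does it fall back to the per-tile loop with index remapping shown next. A simplified standalone sketch of the folding idea (illustrative; the real code also keeps a per-input lastDimSizes):

#include <cstdio>
#include <vector>

int main()
{
    // dims are stored innermost-first, as in TensorDesc: [1, 20, 10] -> {10, 20, 1}
    std::vector<unsigned> out = {10, 20, 1};
    std::vector<std::vector<unsigned>> in = {{10, 20, 1},   // same shape as the output
                                             {10, 1, 1}};   // broadcast over the middle dim
    unsigned run = out[0];
    for (size_t d = 1; d < out.size(); d++) {
        bool same = true;
        for (const auto &shape : in) {
            if (shape[d] != out[d]) {
                same = false;
                break;
            }
        }
        if (!same) {
            break;
        }
        run *= out[d];
    }
    // only the innermost 10 elements are contiguous for both inputs here
    printf("contiguous run per kernel call: %u\n", run);
    return 0;
}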
+ if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + ret = eltwise_general(outputDesc.dt, input, lastDimSizes, num, lastDimSize, output, p.mode); +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + ret = eltwise_arm(outputDesc.dt, input, lastDimSizes, num, lastDimSize, output, p.mode); +#endif +#ifdef _USE_X86 + } else if (IS_X86(arch)) { + ret = eltwise_x86(outputDesc.dt, input, lastDimSizes, num, lastDimSize, output, p.mode); +#endif + } + return ret; + } + + U32 loopNum = tensorNumElements(outputDesc) / lastDimSize; +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = 0; i < loopNum; ++i) { + std::vector index = calculateLocalIndex(i * lastDimSize, outputDesc.dims, outputDesc.nDims); + std::vector ip(num); for (U32 j = 0; j < num; j++) { std::vector relativeIndex = calculateRelativeLocalIndex_cpu( - index.data(), newInputDesc[j].dims, newInputDesc[j].nDims); - U32 globalIndex = calculateGlobalIndex( - relativeIndex.data(), newInputDesc[j].dims, newInputDesc[j].nDims); - newInput[j] = (U8 *)(input[j]) + globalIndex * bytesOf(newInputDesc[j].dt); + index.data(), inputDesc[j].dims, inputDesc[j].nDims); + U32 globalIndex = + calculateGlobalIndex(relativeIndex.data(), inputDesc[j].dims, inputDesc[j].nDims); + ip[j] = (U8 *)(input[j]) + globalIndex * bytesOf(inputDesc[j].dt); } - U8 *newOutput = (U8 *)output + i * bytesOf(newOutputDesc.dt); + U8 *op = (U8 *)output + i * lastDimSize * bytesOf(outputDesc.dt); if (IS_GENERAL(arch)) { #ifdef _USE_GENERAL - ret = eltwise_general(newOutputDesc.dt, newInput, lastDimSizes, num, lastDimSize, - newOutput, eltwiseDesc.elt_mode); + ret = eltwise_general(outputDesc.dt, ip, lastDimSizes, num, lastDimSize, op, p.mode); #endif #ifdef _USE_NEON } else if (IS_ARM(arch)) { - ret = eltwise_arm(newOutputDesc.dt, newInput, lastDimSizes, num, lastDimSize, newOutput, - eltwiseDesc.elt_mode); + ret = eltwise_arm(outputDesc.dt, ip, lastDimSizes, num, lastDimSize, op, p.mode); #endif #ifdef _USE_X86 } else if (IS_X86(arch)) { - ret = eltwise_x86(newOutputDesc.dt, newInput, lastDimSizes, num, lastDimSize, newOutput, - eltwiseDesc.elt_mode); + ret = eltwise_x86(outputDesc.dt, ip, lastDimSizes, num, lastDimSize, op, p.mode); #endif } } - if (ret == SUCCESS && eltwiseDesc.activation_type != ACTIVATION_NULL) { - ActivationParamSpec p; - p.mode = eltwiseDesc.activation_type; - ret = activation_cpu(outputDesc, output, p, outputDesc, output, arch); + return ret; +} + +// [1, 10, 10] + [1, 10, 10] = [1, 10, 10] +// [1, 10, 1] + [1, 1, 10] = [1, 10, 10] +// [1, 20, 10] + [10] = [1. 
20, 10] + [1, 1, 10] = [1, 20, 10] +EE eltwise_cpu(std::vector inputDesc, + std::vector input, + EltwiseParamSpec p, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + Arch arch) +{ + U32 num = inputDesc.size(); + if (num <= 1 || outputDesc.nDims < 1) { + return NOT_MATCH; + } + if (tensorNumElements(outputDesc) == 0) { + return SUCCESS; + } + align_param(inputDesc, input, tmp, outputDesc); + + EE ret = NOT_SUPPORTED; + int scaleId = -1; + TensorDesc scaleDesc; + int axis = scale_axis(inputDesc, outputDesc, &scaleId, &scaleDesc); + if (axis >= 0 && (p.mode == ELTWISE_PROD || p.mode == ELTWISE_SUM)) { + ScaleParamSpec sp; + sp.axis = axis; + if (p.mode == ELTWISE_PROD) { + ret = scale_cpu(scaleDesc, input[scaleId], input[1 - scaleId], nullptr, sp, scaleDesc, + output, arch); + } else { + ret = scale_cpu(scaleDesc, input[scaleId], nullptr, input[1 - scaleId], sp, scaleDesc, + output, arch); + } + } else { + ret = eltwise_kernel(inputDesc, input, p, outputDesc, output, arch); + } + if (ret == SUCCESS && p.activation_type != ACTIVATION_NULL) { + ActivationParamSpec ap; + ap.mode = p.activation_type; + ret = activation_cpu(outputDesc, output, ap, outputDesc, output, arch); } return ret; } diff --git a/compute/tensor/src/cpu/embedding.cpp b/compute/tensor/src/cpu/embedding.cpp index 9946248a..5bd0b156 100644 --- a/compute/tensor/src/cpu/embedding.cpp +++ b/compute/tensor/src/cpu/embedding.cpp @@ -25,8 +25,8 @@ EE embedding_cpu(TensorDesc inputDesc, U8 *outputPtr = (U8 *)output; U32 len = tensorNumElements(inputDesc); U32 elementBytes = bytesOf(weightDesc.dt); - U32 wordEmbeddingCPUBytes = elementBytes * p.num_output; - U32 transposeStride = elementBytes * p.input_dim; + U32 wordEmbeddingCPUBytes = elementBytes * p.num_outputs; + U32 transposeStride = elementBytes * p.num_inputs; EE ret = SUCCESS; for (U32 i = 0; i < len; i++) { U32 wordIndex = 0; @@ -52,14 +52,14 @@ EE embedding_cpu(TensorDesc inputDesc, U8 *dest = outputPtr; if (p.transpose) { U8 *src = weightPtr + wordIndex * elementBytes; - for (U32 j = 0; j < p.num_output; j++) { - memcpy(dest, src, elementBytes); + for (U32 j = 0; j < p.num_outputs; j++) { + UNI_MEMCPY(dest, src, elementBytes); src += transposeStride; dest += elementBytes; } } else { U8 *src = weightPtr + wordIndex * wordEmbeddingCPUBytes; - memcpy(dest, src, wordEmbeddingCPUBytes); + UNI_MEMCPY(dest, src, wordEmbeddingCPUBytes); } outputPtr += wordEmbeddingCPUBytes; } diff --git a/compute/tensor/src/cpu/gat.cpp b/compute/tensor/src/cpu/gat.cpp index 4d475818..f042964c 100644 --- a/compute/tensor/src/cpu/gat.cpp +++ b/compute/tensor/src/cpu/gat.cpp @@ -46,7 +46,7 @@ void preprocess(TensorDesc node_feature_desc, std::vector inputDescs = {outputDesc, outputDesc, outputDesc}; std::vector inputs = {out0, out1, edge_feature}; EltwiseParamSpec eltwiseDesc; - eltwiseDesc.elt_mode = ELTWISE_SUM; + eltwiseDesc.mode = ELTWISE_SUM; eltwiseDesc.activation_type = ACTIVATION_NULL; CHECK_STATUS(eltwise_cpu(inputDescs, inputs, eltwiseDesc, 0, nullptr, outputDesc, output, arch)); @@ -82,7 +82,7 @@ void neighborhood_aware_softmax_yun(TensorDesc inputDesc, } #endif T *out1 = (T *)tmp; - memset(out1, 0, sizeof(T) * num_nodes * num_heads); + UNI_MEMSET(out1, 0, sizeof(T) * num_nodes * num_heads); for (int i = 0; i < num_edges; i++) { int node = nodes1[i]; for (int j = 0; j < num_heads; j++) { @@ -92,13 +92,13 @@ void neighborhood_aware_softmax_yun(TensorDesc inputDesc, for (int i = 0; i < num_edges; i++) { int node = nodes1[i]; - memcpy(output + i * num_heads, out1 + node * 
num_heads, num_heads * sizeof(T)); + UNI_MEMCPY(output + i * num_heads, out1 + node * num_heads, num_heads * sizeof(T)); } std::vector inputDescs = {inputDesc, inputDesc}; std::vector inputs = {out0, output}; EltwiseParamSpec eltwiseDesc; - eltwiseDesc.elt_mode = ELTWISE_DIV; + eltwiseDesc.mode = ELTWISE_DIV; eltwiseDesc.activation_type = ACTIVATION_NULL; CHECK_STATUS(eltwise_cpu(inputDescs, inputs, eltwiseDesc, 0, nullptr, inputDesc, output, arch)); } @@ -112,7 +112,7 @@ void scatter_atten_score(const int *nodes0, int num_edges, T *out) { - memset(out, 0, sizeof(T) * num_heads * num_nodes * num_nodes); + UNI_MEMSET(out, 0, sizeof(T) * num_heads * num_nodes * num_nodes); for (int j = 0, k = 0; j < num_edges; j++) { int node0 = nodes0[j]; int node1 = nodes1[j]; @@ -142,7 +142,7 @@ EE gat_cpu(TensorDesc node_feature_desc, tmp = (U8 *)out1 + tensorNumBytes(edge_feature_desc); // tmpBytes = tensorNumBytes(edge_feature_desc) * 2 preprocess(node_feature_desc, node_desc, node_features0, nodes0, node_features1, nodes1, - edge_feature, p.activation, tmp, edge_feature_desc, out0, arch); + edge_feature, p.activation_type, tmp, edge_feature_desc, out0, arch); int num_heads = p.num_heads; int num_nodes = node_feature_desc.dims[1]; diff --git a/compute/tensor/src/cpu/gather.cpp b/compute/tensor/src/cpu/gather.cpp index 5dc501af..d0ce6211 100644 --- a/compute/tensor/src/cpu/gather.cpp +++ b/compute/tensor/src/cpu/gather.cpp @@ -24,7 +24,7 @@ inline static void gather(const TensorDesc &dataDesc, { int axis = (p.axis + dataDesc.nDims) % dataDesc.nDims; axis = dataDesc.nDims - 1 - axis; - int outer_loop = 1, k = dataDesc.dims[axis], inner_loop = 1; + int outer_loop = 1, k = dataDesc.dims[axis], loop = tensorNumElements(indexDesc), inner_loop = 1; for (int i = 0; i < axis; i++) { inner_loop *= dataDesc.dims[i]; } @@ -32,11 +32,18 @@ inline static void gather(const TensorDesc &dataDesc, outer_loop *= dataDesc.dims[i]; } int tile_size = inner_loop; - for (int i = 0, dst_index = 0; i < outer_loop; i++) { - for (U32 j = 0; j < tensorNumElements(indexDesc); j++, dst_index += tile_size) { - int src_index = (i * k + index[j]) * tile_size; - memcpy(output + dst_index, data + src_index, tile_size * sizeof(T)); - } +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (int o = 0; o < outer_loop * loop; o++) { + int i = o / loop; + int j = o % loop; + U32 dst_index = o * tile_size; + //for (int i = 0, dst_index = 0; i < outer_loop; i++) + //for (U32 j = 0; j < loop; j++, dst_index += tile_size) + int stable_index = index[j] < 0 ? index[j] + k : index[j]; + int src_index = (i * k + stable_index) * tile_size; + UNI_MEMCPY(output + dst_index, data + src_index, tile_size * sizeof(T)); } } @@ -51,12 +58,14 @@ inline static void gather_elements(const TensorDesc &dataDesc, { int axis = (p.axis + dataDesc.nDims) % dataDesc.nDims; axis = dataDesc.nDims - 1 - axis; - - for (U32 i = 0; i < tensorNumElements(dataDesc); i++) { - std::vector local = calculateLocalIndex(i, dataDesc.dims, dataDesc.nDims); - local[axis] = index[i]; - U32 k = calculateGlobalIndex(local.data(), dataDesc.dims, dataDesc.nDims); - output[i] = data[k]; +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = 0; i < tensorNumElements(indexDesc); i++) { + std::vector local = calculateLocalIndex(i, indexDesc.dims, indexDesc.nDims); + local[axis] = index[i] < 0 ? 
index[i] + dataDesc.dims[axis] : index[i]; + U32 idx = calculateGlobalIndex(local.data(), dataDesc.dims, dataDesc.nDims); + output[i] = data[idx]; } } @@ -82,20 +91,32 @@ inline static void gatherND(const TensorDesc &dataDesc, newDataDesc.dims[axis + 1] = batch_dims_size; newDataDesc.nDims = axis + 1 + 1; - U32 gather_index[16] = {0}; int tile_dims = newDataDesc.nDims - (k + 1); - gather_index[tile_dims + k] = p.batch_dims; U32 tile_size = 1; for (int i = 0; i < tile_dims; i++) { tile_size *= newDataDesc.dims[i]; } - for (int batch_dim = 0, i = 0, dst_index = 0; batch_dim < batch_dims_size; batch_dim++) { - for (int outer_dim = 0; outer_dim < t; outer_dim++, i += k, dst_index += tile_size) { +#ifdef _USE_OPENMP +#pragma omp parallel num_threads(OMP_NUM_THREADS) +#endif + { + U32 gather_index[16] = {0}; + gather_index[tile_dims + k] = p.batch_dims; +#ifdef _USE_OPENMP +#pragma omp for +#endif + for (int o = 0; o < batch_dims_size * t; o++) { + int batch_dim = o / t; + int outer_dim = o % t; + int i = o * k; + int dst_index = o * tile_size; + //for (int batch_dim = 0, i = 0, dst_index = 0; batch_dim < batch_dims_size; batch_dim++) + // for (int outer_dim = 0; outer_dim < t; outer_dim++, i += k, dst_index += tile_size) { for (int j = 0; j < k; j++) { gather_index[tile_dims + k - 1 - j] = index[i + j]; } U32 src_index = calculateGlobalIndex(gather_index, newDataDesc.dims, newDataDesc.nDims); - memcpy(output + dst_index, data + src_index, tile_size * sizeof(T)); + UNI_MEMCPY(output + dst_index, data + src_index, tile_size * sizeof(T)); } } } @@ -137,9 +158,14 @@ EE gather_cpu(TensorDesc dataDesc, EE ret = SUCCESS; switch (dataDesc.dt) { case DT_I32: + case DT_U32: gather_kernel(dataDesc, (const I32 *)data, indexDesc, (const int *)index, p, outputDesc, (I32 *)output); break; + case DT_U8: + gather_kernel(dataDesc, (const U8 *)data, indexDesc, (const int *)index, p, + outputDesc, (U8 *)output); + break; #ifdef _USE_FP32 case DT_F32: gather_kernel(dataDesc, (const F32 *)data, indexDesc, (const int *)index, p, diff --git a/compute/tensor/src/cpu/general/attention.cpp b/compute/tensor/src/cpu/general/attention.cpp index dc12c890..19c47a83 100644 --- a/compute/tensor/src/cpu/general/attention.cpp +++ b/compute/tensor/src/cpu/general/attention.cpp @@ -23,9 +23,9 @@ EE attention( } T minValue = -10000.0; - U32 count = array_sum_template(input, toSequenceLength); - U32 valid = UNI_MIN(count, fromSequenceLength); for (U32 n = 0; n < batch; n++) { + U32 count = array_sum_template(input + n * toSequenceLength, toSequenceLength); + U32 valid = UNI_MIN(count, fromSequenceLength); for (U32 i = 0; i < numHeads; i++) { for (U32 j = 0; j < valid; j++) { for (U32 k = 0; k < toSequenceLength; k++) { diff --git a/compute/tensor/src/cpu/general/attention_mask.cpp b/compute/tensor/src/cpu/general/attention_mask.cpp index c4d45592..6e25f132 100644 --- a/compute/tensor/src/cpu/general/attention_mask.cpp +++ b/compute/tensor/src/cpu/general/attention_mask.cpp @@ -55,7 +55,7 @@ static EE attention_mask(TensorDesc inputDesc, if (start + loops > klen) { loops = UNI_MAX(klen - start, 0); } - memset(&mask[i][start], 0, sizeof(T) * loops); + UNI_MEMSET(&mask[i][start], 0, sizeof(T) * loops); } } I32 loops = tensorNumElements(inputDesc) / qlen / klen; diff --git a/compute/tensor/src/cpu/general/check.cpp b/compute/tensor/src/cpu/general/check.cpp deleted file mode 100644 index ed269423..00000000 --- a/compute/tensor/src/cpu/general/check.cpp +++ /dev/null @@ -1,116 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. 
All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -#include "cpu/general/tensor_computing_general.h" -#include "uni.h" - -template -static EE check(TensorDesc inputDescA, - const T *inputA, - TensorDesc inputDescB, - const T *inputB, - CheckMode checkMode, - TensorDesc outputDesc, - I32 *output) -{ - UNUSED(inputDescB); - UNUSED(outputDesc); - - if (nullptr == inputA || nullptr == inputB || nullptr == output) { - CHECK_STATUS(NULL_POINTER); - } - - U32 size = tensorNumElements(inputDescA); - U32 loopOuter = inputDescA.dims[inputDescA.nDims - 1]; - U32 loopInner = size / loopOuter; - - for (U32 i = 0; i < loopOuter; i++) { - U32 count = 0; - for (U32 j = 0; j < loopInner; j++) { - U32 index = i * loopInner + j; - switch (checkMode) { - case CHECK_EQUAL: { - if (inputA[index] == inputB[index]) { - count++; - } - break; - } - case CHECK_GREATEQUAL: { - if (inputA[index] >= inputB[index]) { - count++; - } - break; - } - case CHECK_GREAT: { - if (inputA[index] > inputB[index]) { - count++; - } - break; - } - default: - CHECK_STATUS(NOT_SUPPORTED); - break; - } - } - - if (count == loopInner) { - output[i] = 1; - } else { - output[i] = 0; - } - } - return SUCCESS; -} - -EE check_general(TensorDesc inputDescA, - const void *inputA, - TensorDesc inputDescB, - const void *inputB, - CheckParamSpec p, - TensorDesc outputDesc, - void *output) -{ - DataType idt = inputDescA.dt; - EE ret = SUCCESS; - switch (idt) { -#ifdef _USE_FP16 - case DT_F16: { - ret = check(inputDescA, (const F16 *)inputA, inputDescB, (const F16 *)inputB, - p.check_mode, outputDesc, (I32 *)output); - break; - } -#endif -#ifdef _USE_FP32 - case DT_F32: { - ret = check(inputDescA, (const F32 *)inputA, inputDescB, (const F32 *)inputB, - p.check_mode, outputDesc, (I32 *)output); - break; - } -#endif - case DT_U32: { - ret = check(inputDescA, (const U32 *)inputA, inputDescB, (const U32 *)inputB, - p.check_mode, outputDesc, (I32 *)output); - break; - } - case DT_I32: { - ret = check(inputDescA, (const I32 *)inputA, inputDescB, (const I32 *)inputB, - p.check_mode, outputDesc, (I32 *)output); - break; - } - default: - ret = NOT_SUPPORTED; - break; - } - - return ret; -} diff --git a/compute/tensor/src/cpu/general/convolution.cpp b/compute/tensor/src/cpu/general/convolution.cpp index 3601897f..26ef62a2 100644 --- a/compute/tensor/src/cpu/general/convolution.cpp +++ b/compute/tensor/src/cpu/general/convolution.cpp @@ -49,9 +49,9 @@ inline EE convolution(TensorDesc inputDesc, U32 strideT = convParamSpec.stride_t; U32 strideH = 
convParamSpec.stride_h; U32 strideW = convParamSpec.stride_w; - U32 paddingB = convParamSpec.padding_before; - U32 paddingT = convParamSpec.padding_top; - U32 paddingL = convParamSpec.padding_left; + U32 paddingB = convParamSpec.pad_before; + U32 paddingT = convParamSpec.pad_top; + U32 paddingL = convParamSpec.pad_left; U32 dilateT = convParamSpec.dilatedRate_t; U32 dilateH = convParamSpec.dilatedRate_h; U32 dilateW = convParamSpec.dilatedRate_w; @@ -202,9 +202,9 @@ EE convolution_general(TensorDesc inputDesc, UNUSED(biasDesc); if (eltwiseInput == nullptr) { - memset(output, 0, tensorNumBytes(outputDesc)); + UNI_MEMSET(output, 0, tensorNumBytes(outputDesc)); } else { - memcpy(output, eltwiseInput, tensorNumBytes(outputDesc)); + UNI_MEMCPY(output, eltwiseInput, tensorNumBytes(outputDesc)); } EE ret = NOT_SUPPORTED; diff --git a/compute/tensor/src/cpu/general/cumsum.cpp b/compute/tensor/src/cpu/general/cumsum.cpp new file mode 100644 index 00000000..41a983a5 --- /dev/null +++ b/compute/tensor/src/cpu/general/cumsum.cpp @@ -0,0 +1,94 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
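The new cumsum_general kernel below walks a single axis and honours exclusive and reverse flags. A standalone 1-D sketch of the variants it has to produce (illustrative helper, not the patch code):

#include <cstdio>
#include <vector>

static std::vector<int> cumsum1d(const std::vector<int> &x, bool exclusive, bool reverse)
{
    std::vector<int> y(x.size());
    int acc = 0;
    if (!reverse) {
        for (size_t i = 0; i < x.size(); i++) {
            y[i] = exclusive ? acc : acc + x[i];  // exclusive shifts the sums by one slot
            acc += x[i];
        }
    } else {
        for (size_t i = x.size(); i-- > 0;) {     // accumulate from the end of the axis
            y[i] = exclusive ? acc : acc + x[i];
            acc += x[i];
        }
    }
    return y;
}

int main()
{
    std::vector<int> x = {1, 2, 3, 4};
    for (int v : cumsum1d(x, false, false)) printf("%d ", v);  // 1 3 6 10
    printf("\n");
    for (int v : cumsum1d(x, true, false)) printf("%d ", v);   // 0 1 3 6
    printf("\n");
    for (int v : cumsum1d(x, false, true)) printf("%d ", v);   // 10 9 7 4
    printf("\n");
    return 0;
}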
+ +#include "cpu/general/tensor_computing_general.h" + +template +static void cumsum( + TensorDesc inputDesc, const T *input, CumSumParamSpec p, TensorDesc outputDesc, T *output) +{ + int axis = (p.axis + inputDesc.nDims) % inputDesc.nDims; + axis = inputDesc.nDims - 1 - axis; + int loopOuter = 1, loopInner = 1; + for (int i = 0; i < axis; i++) { + loopInner *= inputDesc.dims[i]; + } + int loops = inputDesc.dims[axis]; + for (U32 i = axis + 1; i < inputDesc.nDims; i++) { + loopOuter *= inputDesc.dims[i]; + } + int id, id1; + for (int i = 0; i < loopOuter; i++) { + for (int j = 0; j < loopInner; j++) { + if (p.reverse) { + id = (i * loops + loops - 1) * loopInner + j; + if (p.exclusive) { + output[id] = 0; + id1 = id; + id -= loopInner; + } else { + output[id] = input[id]; + id1 = id - loopInner; + id = id1; + } + for (int k = loops - 2; k >= 0; k--, id -= loopInner, id1 -= loopInner) { + output[id] = output[id + loopInner] + input[id1]; + } + } else { + id = i * loops * loopInner + j; + if (p.exclusive) { + output[id] = 0; + id1 = id; + id += loopInner; + } else { + output[id] = input[id]; + id1 = id + loopInner; + id = id1; + } + for (int k = 1; k < loops; k++, id += loopInner, id1 += loopInner) { + output[id] = output[id - loopInner] + input[id1]; + } + } + } + } +} + +EE cumsum_general( + TensorDesc inputDesc, const void *input, CumSumParamSpec p, TensorDesc outputDesc, void *output) +{ + DataType idt = inputDesc.dt; + EE ret = SUCCESS; + switch (idt) { +#ifdef _USE_FP16 + case DT_F16: { + cumsum(inputDesc, (const F16 *)input, p, outputDesc, (F16 *)output); + break; + } +#endif +#ifdef _USE_FP32 + case DT_F32: { + cumsum(inputDesc, (const F32 *)input, p, outputDesc, (F32 *)output); + break; + } +#endif + case DT_I32: { + cumsum(inputDesc, (const I32 *)input, p, outputDesc, (I32 *)output); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + + return ret; +} diff --git a/compute/tensor/src/cpu/general/deconvolution.cpp b/compute/tensor/src/cpu/general/deconvolution.cpp index e46953c9..5a983def 100644 --- a/compute/tensor/src/cpu/general/deconvolution.cpp +++ b/compute/tensor/src/cpu/general/deconvolution.cpp @@ -36,12 +36,12 @@ inline EE deconvolution(TensorDesc inputDesc, U32 group = convParamSpec.group; U32 strideH = convParamSpec.stride_h; U32 strideW = convParamSpec.stride_w; - U32 paddingT = convParamSpec.padding_top; - U32 paddingL = convParamSpec.padding_left; + U32 paddingT = convParamSpec.pad_top; + U32 paddingL = convParamSpec.pad_left; U32 ocGroupSize = oc / group; // initialize outputs to 0 - memset(outArray, 0, tensorNumBytes(outputDesc)); + UNI_MEMSET(outArray, 0, tensorNumBytes(outputDesc)); U32 ic8 = ic / 8; U32 oc8 = oc / 8; for (U32 n = 0; n < in; n++) { diff --git a/compute/tensor/src/cpu/general/depthwise_pointwise_convolution.cpp b/compute/tensor/src/cpu/general/depthwise_pointwise_convolution.cpp index 739340b6..28df05ea 100644 --- a/compute/tensor/src/cpu/general/depthwise_pointwise_convolution.cpp +++ b/compute/tensor/src/cpu/general/depthwise_pointwise_convolution.cpp @@ -66,8 +66,8 @@ inline EE depthwise_pointwise_convolution(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); U32 strideH = convParamSpec.stride_h; U32 strideW = convParamSpec.stride_w; - U32 paddingT = convParamSpec.padding_top; - U32 paddingL = convParamSpec.padding_left; + U32 paddingT = convParamSpec.pad_top; + U32 paddingL = convParamSpec.pad_left; U32 dilatedRateH = convParamSpec.dilatedRate_h; U32 dilatedRateW = convParamSpec.dilatedRate_w; 
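The depthwise_pointwise_convolution hunk that follows generalises the input and output indexing from a fixed block of 8 channels to either NCHWC8 or NCHWC16. A standalone sketch of the blocked-layout offset it computes (illustrative helper, mirroring the i_off expression in the hunk, not part of the patch):

#include <cstdio>

// offset of logical element (n, c, h, w) in a channel-blocked NCHWCx tensor;
// cx = 8 for NCHWC8, 16 for NCHWC16, and C is assumed divisible by cx
static unsigned nchwcx_offset(unsigned n, unsigned c, unsigned h, unsigned w,
    unsigned C, unsigned H, unsigned W, unsigned cx)
{
    unsigned cBlocks = C / cx;
    return (((n * cBlocks + c / cx) * H + h) * W + w) * cx + c % cx;
}

int main()
{
    // channel 10 of a 16-channel, 4x4 NCHWC8 tensor sits in block 1, lane 2
    printf("%u\n", nchwcx_offset(0, 10, 0, 0, 16, 4, 4, 8));  // prints 130
    return 0;
}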
@@ -80,8 +80,24 @@ inline EE depthwise_pointwise_convolution(TensorDesc inputDesc, } else { pwArray = outArray; } - U32 ic8 = ic / 8; - U32 oc8 = oc / 8; + U32 ic8 = ic; + U32 oc8 = oc; + U32 icx = 1; + U32 ocx = 1; + if (idf == DF_NCHWC16) { + icx = 16; + ic8 /= 16; + } else if (idf == DF_NCHWC8) { + icx = 8; + ic8 /= 8; + } + if (odf == DF_NCHWC16) { + ocx = 16; + oc8 /= 16; + } else if (odf == DF_NCHWC8) { + ocx = 8; + oc8 /= 8; + } for (U32 n = 0, pw_off = 0; n < in; n++) { // dw conv for (U32 c = 0; c < ic; c++) { @@ -94,11 +110,12 @@ inline EE depthwise_pointwise_convolution(TensorDesc inputDesc, I32 iw_idx = w * strideW - paddingL + fw_idx * dilatedRateW; if (ih_idx >= 0 && ih_idx < (I32)ih && iw_idx >= 0 && iw_idx < (I32)iw) { U32 i_off; - if (idf != DF_NCHWC8) { - i_off = ((n * ic + c) * ih + ih_idx) * iw + iw_idx; + if (idf == DF_NCHWC8 || idf == DF_NCHWC16) { + i_off = (((n * ic8 + (c / icx)) * ih + ih_idx) * iw + iw_idx) * + icx + + c % icx; } else { - i_off = (((n * ic8 + (c / 8)) * ih + ih_idx) * iw + iw_idx) * 8 + - c % 8; + i_off = ((n * ic + c) * ih + ih_idx) * iw + iw_idx; } value += inArray[i_off] * dwFilterArray[c * fh * fw + fh_idx * fw + fw_idx]; @@ -108,10 +125,10 @@ inline EE depthwise_pointwise_convolution(TensorDesc inputDesc, CHECK_STATUS( activation_template(depthwiseActivationParamSpec, value, &value)); - if (fuseDepthwisePointwise || odf != DF_NCHWC8) { + if (fuseDepthwisePointwise || (odf != DF_NCHWC8 && odf != DF_NCHWC16)) { pwArray[pw_off] = value; } else { - pwArray[(((n * ic8 + (c / 8)) * oh + h) * ow + w) * 8 + c % 8] = value; + pwArray[(((n * ic8 + (c / ocx)) * oh + h) * ow + w) * ocx + c % ocx] = value; } } } @@ -128,10 +145,10 @@ inline EE depthwise_pointwise_convolution(TensorDesc inputDesc, CHECK_STATUS( activation_template(pointwiseActivationParamSpec, value, &value)); U32 o_off; - if (odf != DF_NCHWC8) { - o_off = (n * oc + o) * oh * ow + hw; + if (odf == DF_NCHWC8 || odf == DF_NCHWC16) { + o_off = ((n * oc8 + (o / ocx)) * oh * ow + hw) * ocx + o % ocx; } else { - o_off = ((n * oc8 + (o / 8)) * oh * ow + hw) * 8 + o % 8; + o_off = (n * oc + o) * oh * ow + hw; } outArray[o_off] += value; } @@ -161,9 +178,9 @@ EE depthwise_pointwise_convolution_general(TensorDesc inputDesc, ActivationParamSpec pointwiseActivationParamSpec) { if (eltwiseInput == nullptr) { - memset(output, 0, tensorNumBytes(outputDesc)); + UNI_MEMSET(output, 0, tensorNumBytes(outputDesc)); } else { - memcpy(output, eltwiseInput, tensorNumBytes(outputDesc)); + UNI_MEMCPY(output, eltwiseInput, tensorNumBytes(outputDesc)); } EE ret = SUCCESS; switch (inputDesc.dt) { diff --git a/compute/tensor/src/cpu/general/general_functions.h b/compute/tensor/src/cpu/general/general_functions.h index bab0a7f4..1790f222 100644 --- a/compute/tensor/src/cpu/general/general_functions.h +++ b/compute/tensor/src/cpu/general/general_functions.h @@ -278,6 +278,9 @@ inline EE array_minmax_value_general(DataType dt, const void *data, I32 len, int case DT_I32: ret = array_minmax_value_template((const I32 *)data, len, mode, result); break; + case DT_U32: + ret = array_minmax_value_template((const U32 *)data, len, mode, result); + break; default: ret = NOT_SUPPORTED; break; diff --git a/compute/tensor/src/cpu/general/normalization.cpp b/compute/tensor/src/cpu/general/normalization.cpp index 793ebd7b..fdc824f9 100644 --- a/compute/tensor/src/cpu/general/normalization.cpp +++ b/compute/tensor/src/cpu/general/normalization.cpp @@ -16,11 +16,12 @@ #include "cpu/general/general_functions.h" #include 
"cpu/general/tensor_computing_general.h" +static float eps = 1e-6; + template -inline EE array_norm_scale_template( +inline static EE array_norm_scale_template( T *input, T *output, I32 len, F32 mean, F32 var, T *alpha, T *beta) { - F32 eps = 1e-6; F32 std_value = sqrt(var + eps); for (I32 i = 0; i < len; i++) { output[i] = alpha[i] * (input[i] - mean) / std_value + beta[i]; @@ -29,12 +30,9 @@ inline EE array_norm_scale_template( } template -inline EE layer_normalization_template( +static EE layer_normalization_nhwc( TensorDesc inputDesc, T *input, T *alpha, T *beta, TensorDesc outputDesc, T *output) { - if (nullptr == input || nullptr == output) { - CHECK_STATUS(NULL_POINTER); - } if (inputDesc.dt != outputDesc.dt || inputDesc.df != outputDesc.df) { CHECK_STATUS(NOT_MATCH); } @@ -51,32 +49,104 @@ inline EE layer_normalization_template( array_norm_scale_template( current_input, current_output, size_inner, mean, var, alpha, beta); } + return SUCCESS; +} + +template +static EE layer_normalization_nchwc8( + TensorDesc inputDesc, T *input, T *alpha, T *beta, TensorDesc outputDesc, T *output) +{ + int n = inputDesc.dims[inputDesc.nDims - 1]; + int c = inputDesc.dims[inputDesc.nDims - 2]; + int hw = 1; + for (unsigned int i = 0; i < inputDesc.nDims - 2; i++) { + hw *= inputDesc.dims[i]; + } + int c8 = c / 8; + for (int i = 0; i < n; i++) { + for (int j = 0; j < hw; j++) { + F32 sum = 0; + for (int k = 0; k < c8; k++) { + int id = ((i * c8 + k) * hw + j) * 8; + for (int a = id; a < id + 8; a++) { + sum += input[a]; + } + } + F32 mean = sum / c; + + sum = 0; + for (int k = 0; k < c8; k++) { + int id = ((i * c8 + k) * hw + j) * 8; + for (int a = id; a < id + 8; a++) { + F32 tmp = input[a] - mean; + sum += tmp * tmp; + } + } + F32 var = sum / c; + F32 std_value = sqrt(var + eps); + for (int k = 0, kk = 0; k < c8; k++) { + int id = ((i * c8 + k) * hw + j) * 8; + for (int a = id; a < id + 8; a++, kk++) { + output[a] = alpha[kk] * ((input[a] - mean) / std_value) + beta[kk]; + } + } + } + } return SUCCESS; } -EE layer_normalization_general( - TensorDesc inputDesc, void *input, void *alpha, void *beta, TensorDesc outputDesc, void *output) +template +static EE layer_normalization_template(TensorDesc inputDesc, + T *input, + LayerNormParamSpec p, + T *alpha, + T *beta, + TensorDesc outputDesc, + T *output) +{ + if (nullptr == alpha || nullptr == beta || nullptr == input || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + + EE ret = NOT_SUPPORTED; + if (inputDesc.df == DF_NCHWC8) { + if (p.axis == 1) { + ret = layer_normalization_nchwc8(inputDesc, input, alpha, beta, outputDesc, output); + } + } else { + if (p.axis == -1) { + ret = layer_normalization_nhwc(inputDesc, input, alpha, beta, outputDesc, output); + } + } + return ret; +} + +EE layer_normalization_general(TensorDesc inputDesc, + void *input, + LayerNormParamSpec p, + void *alpha, + void *beta, + TensorDesc outputDesc, + void *output) { - DataType idt = inputDesc.dt; - EE ret = SUCCESS; - switch (idt) { + EE ret = NOT_SUPPORTED; + switch (inputDesc.dt) { #ifdef _USE_FP32 case DT_F32: { ret = layer_normalization_template( - inputDesc, (F32 *)input, (F32 *)alpha, (F32 *)beta, outputDesc, (F32 *)output); + inputDesc, (F32 *)input, p, (F32 *)alpha, (F32 *)beta, outputDesc, (F32 *)output); break; } #endif #ifdef _USE_FP16 case DT_F16: { ret = layer_normalization_template( - inputDesc, (F16 *)input, (F16 *)alpha, (F16 *)beta, outputDesc, (F16 *)output); + inputDesc, (F16 *)input, p, (F16 *)alpha, (F16 *)beta, outputDesc, (F16 *)output); break; } 
#endif default: - ret = NOT_SUPPORTED; break; } return ret; diff --git a/compute/tensor/src/cpu/general/padding.cpp b/compute/tensor/src/cpu/general/padding.cpp index 202bcb52..892289d1 100644 --- a/compute/tensor/src/cpu/general/padding.cpp +++ b/compute/tensor/src/cpu/general/padding.cpp @@ -39,33 +39,33 @@ EE padding_general(TensorDesc inputDesc, (const U8 *)input + (((n * ic + c) * ih + h) * iw) * alignSize * bytesOf(idt); U8 *outPtr = (U8 *)output + (((n * oc + c) * oh + (padParamSpec.top + h)) * ow) * alignSize * bytesOf(odt); - if (padParamSpec.pad_mode == Pad_Constant) { - memset(outPtr, 0, padParamSpec.left * alignSize * bytesOf(odt)); + if (padParamSpec.pad_mode == PAD_CONSTANT) { + UNI_MEMSET(outPtr, 0, padParamSpec.left * alignSize * bytesOf(odt)); outPtr += padParamSpec.left * alignSize * bytesOf(odt); - memcpy(outPtr, inPtr, iw * alignSize * bytesOf(idt)); + UNI_MEMCPY(outPtr, inPtr, iw * alignSize * bytesOf(idt)); outPtr += iw * alignSize * bytesOf(odt); - memset(outPtr, 0, padParamSpec.right * alignSize * bytesOf(odt)); + UNI_MEMSET(outPtr, 0, padParamSpec.right * alignSize * bytesOf(odt)); } else { for (U32 w = 0; w < padParamSpec.left; w++) { U32 index = 0; - if (padParamSpec.pad_mode == Pad_Reflect) { + if (padParamSpec.pad_mode == PAD_REFLECT) { index = (padParamSpec.left - w) * alignSize * bytesOf(idt); - } else if (padParamSpec.pad_mode == Pad_Symmetric) { + } else if (padParamSpec.pad_mode == PAD_SYMMETRIC) { index = (padParamSpec.left - w - 1) * alignSize * bytesOf(idt); } - memcpy(outPtr, inPtr + index, alignSize * bytesOf(idt)); + UNI_MEMCPY(outPtr, inPtr + index, alignSize * bytesOf(idt)); outPtr += alignSize * bytesOf(idt); } - memcpy(outPtr, inPtr, iw * alignSize * bytesOf(idt)); + UNI_MEMCPY(outPtr, inPtr, iw * alignSize * bytesOf(idt)); outPtr += iw * alignSize * bytesOf(odt); for (U32 w = 0; w < padParamSpec.right; w++) { U32 index = (iw - 1) * alignSize * bytesOf(idt); - if (padParamSpec.pad_mode == Pad_Reflect) { + if (padParamSpec.pad_mode == PAD_REFLECT) { index = (iw - w - 2) * alignSize * bytesOf(idt); - } else if (padParamSpec.pad_mode == Pad_Symmetric) { + } else if (padParamSpec.pad_mode == PAD_SYMMETRIC) { index = (iw - w - 1) * alignSize * bytesOf(idt); } - memcpy(outPtr, inPtr + index, alignSize * bytesOf(idt)); + UNI_MEMCPY(outPtr, inPtr + index, alignSize * bytesOf(idt)); outPtr += alignSize * bytesOf(idt); } } @@ -73,20 +73,20 @@ EE padding_general(TensorDesc inputDesc, U8 *outPtr = (U8 *)output + (((n * oc + c) * oh) * ow) * alignSize * bytesOf(odt); for (U32 h = 0; h < padParamSpec.top; h++) { U32 index = h * ow * alignSize * bytesOf(odt); - if (padParamSpec.pad_mode == Pad_Constant) { - memset(outPtr + index, 0, ow * alignSize * bytesOf(odt)); - } else if (padParamSpec.pad_mode == Pad_Edge) { - memcpy(outPtr + index, + if (padParamSpec.pad_mode == PAD_CONSTANT) { + UNI_MEMSET(outPtr + index, 0, ow * alignSize * bytesOf(odt)); + } else if (padParamSpec.pad_mode == PAD_EDGE) { + UNI_MEMCPY(outPtr + index, outPtr + (padParamSpec.top * ow * alignSize * bytesOf(odt)), ow * alignSize * bytesOf(odt)); - } else if (padParamSpec.pad_mode == Pad_Reflect) { - memcpy(outPtr + index, + } else if (padParamSpec.pad_mode == PAD_REFLECT) { + UNI_MEMCPY(outPtr + index, outPtr + ((padParamSpec.top + padParamSpec.top - h) * ow * alignSize * bytesOf(odt)), ow * alignSize * bytesOf(odt)); - } else if (padParamSpec.pad_mode == Pad_Symmetric) { - memcpy(outPtr + index, + } else if (padParamSpec.pad_mode == PAD_SYMMETRIC) { + UNI_MEMCPY(outPtr + index, outPtr + 
((padParamSpec.top + padParamSpec.top - h - 1) * ow * alignSize * bytesOf(odt)), @@ -97,21 +97,21 @@ EE padding_general(TensorDesc inputDesc, } for (U32 h = 0; h < padParamSpec.bottom; h++) { U32 index = (padParamSpec.top + ih + h) * ow * alignSize * bytesOf(odt); - if (padParamSpec.pad_mode == Pad_Constant) { - memset(outPtr + index, 0, ow * alignSize * bytesOf(odt)); - } else if (padParamSpec.pad_mode == Pad_Edge) { - memcpy(outPtr + index, + if (padParamSpec.pad_mode == PAD_CONSTANT) { + UNI_MEMSET(outPtr + index, 0, ow * alignSize * bytesOf(odt)); + } else if (padParamSpec.pad_mode == PAD_EDGE) { + UNI_MEMCPY(outPtr + index, outPtr + ((padParamSpec.top + ih - 1) * ow * alignSize * bytesOf(odt)), ow * alignSize * bytesOf(odt)); - } else if (padParamSpec.pad_mode == Pad_Reflect) { - // memcpy(outPtr+index, outPtr+((padParamSpec.top+ih-2-h)*ow*alignSize*bytesOf(odt)), ow*alignSize*bytesOf(odt)); - memcpy(outPtr + index, + } else if (padParamSpec.pad_mode == PAD_REFLECT) { + // UNI_MEMCPY(outPtr+index, outPtr+((padParamSpec.top+ih-2-h)*ow*alignSize*bytesOf(odt)), ow*alignSize*bytesOf(odt)); + UNI_MEMCPY(outPtr + index, outPtr + ((padParamSpec.top + ih - 1 - padParamSpec.bottom + h) * ow * alignSize * bytesOf(odt)), ow * alignSize * bytesOf(odt)); - } else if (padParamSpec.pad_mode == Pad_Symmetric) { - memcpy(outPtr + index, + } else if (padParamSpec.pad_mode == PAD_SYMMETRIC) { + UNI_MEMCPY(outPtr + index, outPtr + ((padParamSpec.top + ih - 1 - h) * ow * alignSize * bytesOf(odt)), ow * alignSize * bytesOf(odt)); } else { diff --git a/compute/tensor/src/cpu/general/pooling.cpp b/compute/tensor/src/cpu/general/pooling.cpp index 54dcf007..2dc836e6 100644 --- a/compute/tensor/src/cpu/general/pooling.cpp +++ b/compute/tensor/src/cpu/general/pooling.cpp @@ -16,9 +16,10 @@ #include "cpu/general/tensor_computing_general.h" -template -EE pooling(T *input, - T *output, +template +EE pooling(DataType idt, + T1 *input, + T1 *output, I32 in, I32 ic, I32 it, @@ -27,25 +28,26 @@ EE pooling(T *input, I32 ot, I32 oh, I32 ow, - I32 stride_t, - I32 stride_h, - I32 stride_w, - I32 padding_before, - I32 padding_after, - I32 padding_top, - I32 padding_bottom, - I32 padding_left, - I32 padding_right, - I32 kernel_t, - I32 kernel_h, - I32 kernel_w, - PoolingMode pm, - RoundMode rm, + PoolingParamSpec p, I32 alignSize, - F32 minValue) + F32 minValue, + void *scale) { CHECK_REQUIREMENT(ic % alignSize == 0); ic = ic / alignSize; + float poolSize = p.kernel_t * p.kernel_h * p.kernel_w; + +#ifdef _USE_INT8 + F32 *inputScale = (F32 *)scale; + F32 *outputScale = inputScale + 1; + I32 shift = 65536; + I32 factor = shift / poolSize; + if (p.mode == POOLING_MAX) { + *outputScale = *inputScale; + } else { + *outputScale = *inputScale * factor * poolSize / (F32)shift; + } +#endif EE ret = SUCCESS; for (I32 n = 0; n < in; n++) { @@ -54,26 +56,29 @@ EE pooling(T *input, for (I32 t = 0; t < ot; t++) { for (I32 h = 0; h < oh; h++) { for (I32 w = 0; w < ow; w++) { - int tstart = t * stride_t - padding_before; - int hstart = h * stride_h - padding_top; - int wstart = w * stride_w - padding_left; - int tend = tstart + kernel_t; - int hend = hstart + kernel_h; - int wend = wstart + kernel_w; + int tstart = t * p.stride_t - p.pad_before; + int hstart = h * p.stride_h - p.pad_top; + int wstart = w * p.stride_w - p.pad_left; + int tend = tstart + p.kernel_t; + int hend = hstart + p.kernel_h; + int wend = wstart + p.kernel_w; tstart = UNI_MAX(tstart, 0); hstart = UNI_MAX(hstart, 0); wstart = UNI_MAX(wstart, 0); tend = UNI_MIN(tend, 
it); hend = UNI_MIN(hend, ih); wend = UNI_MIN(wend, iw); - float poolSize = (tend - tstart) * (hend - hstart) * (wend - wstart); - T value; - switch (pm) { + if (!p.count_include_pad) { + poolSize = (tend - tstart) * (hend - hstart) * (wend - wstart); + } + T1 maxVal = 0; + T2 meanVal = 0; + switch (p.mode) { case POOLING_MAX: - value = minValue; + maxVal = minValue; break; case POOLING_MEAN: - value = 0; + meanVal = 0; break; default: return NOT_SUPPORTED; @@ -86,13 +91,13 @@ EE pooling(T *input, U32 in_off = ((((n * ic + c) * it + z) * ih + x) * iw + y) * alignSize + j; - switch (pm) { + switch (p.mode) { case POOLING_MAX: - value = (value > input[in_off]) ? value - : input[in_off]; + maxVal = (maxVal > input[in_off]) ? maxVal + : input[in_off]; break; case POOLING_MEAN: - value += input[in_off]; + meanVal += input[in_off]; break; default: ret = NOT_SUPPORTED; @@ -101,18 +106,25 @@ EE pooling(T *input, } } } - switch (pm) { + switch (p.mode) { case POOLING_MAX: + output[out_off] = maxVal; break; case POOLING_MEAN: - value = value / poolSize; + if (idt == DT_I8 || idt == DT_U8_Q) { +#ifdef _USE_INT8 + I32 factor = shift / + ((tend - tstart) * (hend - hstart) * (wend - wstart)); + output[out_off] = ((I32)meanVal * factor) >> 16; +#endif + } else { + output[out_off] = meanVal / poolSize; + } break; default: ret = NOT_SUPPORTED; break; } - - output[out_off] = value; } } } @@ -122,8 +134,12 @@ EE pooling(T *input, return ret; } -EE pooling_general( - TensorDesc inputDesc, const void *input, PoolingParamSpec p, TensorDesc outputDesc, void *output) +EE pooling_general(TensorDesc inputDesc, + const void *input, + PoolingParamSpec p, + void *scale, + TensorDesc outputDesc, + void *output) { if (nullptr == input || nullptr == output) { CHECK_STATUS(NULL_POINTER); @@ -143,25 +159,35 @@ EE pooling_general( return NOT_SUPPORTED; } - if (in != on || ic != oc || idf != DF_NCHWC8 || odf != idf) { + if (in != on || ic != oc || (idf != DF_NCHWC8 && idf != DF_NCHWC16) || odf != idf) { CHECK_STATUS(NOT_MATCH); } + I32 alignSize = 8; + if (idf == DF_NCHWC16) { + alignSize = 16; + } EE ret = SUCCESS; switch (idt) { #ifdef _USE_FP32 case DT_F32: - ret = pooling((F32 *)input, (F32 *)output, in, ic, it, ih, iw, ot, oh, ow, p.stride_t, - p.stride_h, p.stride_w, p.padding_before, p.padding_after, p.padding_top, - p.padding_bottom, p.padding_left, p.padding_right, p.kernel_t, p.kernel_h, - p.kernel_w, p.mode, p.rm, 8, -FLT_MAX); + ret = pooling(idt, (F32 *)input, (F32 *)output, in, ic, it, ih, iw, ot, oh, + ow, p, alignSize, -FLT_MAX, scale); break; #endif #ifdef _USE_FP16 case DT_F16: - ret = pooling((F16 *)input, (F16 *)output, in, ic, it, ih, iw, ot, oh, ow, p.stride_t, - p.stride_h, p.stride_w, p.padding_before, p.padding_after, p.padding_top, - p.padding_bottom, p.padding_left, p.padding_right, p.kernel_t, p.kernel_h, - p.kernel_w, p.mode, p.rm, 8, -UNI_F16_MAX); + ret = pooling(idt, (F16 *)input, (F16 *)output, in, ic, it, ih, iw, ot, oh, + ow, p, alignSize, -UNI_F16_MAX, scale); + break; +#endif +#ifdef _USE_INT8 + case DT_I8: + ret = pooling(idt, (INT8 *)input, (INT8 *)output, in, ic, it, ih, iw, ot, oh, + ow, p, alignSize, -UNI_F16_MAX, scale); + break; + case DT_U8_Q: + ret = pooling(idt, (UINT8 *)input, (UINT8 *)output, in, ic, it, ih, iw, ot, + oh, ow, p, alignSize, -UNI_F16_MAX, scale); break; #endif default: diff --git a/compute/tensor/src/cpu/general/pooling_bp.cpp b/compute/tensor/src/cpu/general/pooling_bp.cpp index a4acc4f4..b178dab7 100644 --- a/compute/tensor/src/cpu/general/pooling_bp.cpp +++ 
b/compute/tensor/src/cpu/general/pooling_bp.cpp @@ -17,41 +17,28 @@ #include "cpu/general/tensor_computing_general.h" template -EE pooling_bp(T *input, - T *output, - U32 in, - U32 ic, - U32 ih, - U32 iw, - U32 strideH, - U32 strideW, - U32 paddingT, - U32 paddingL, - U32 kernelH, - U32 kernelW, - PoolingMode pm, - U32 oh, - U32 ow, - U32 alignSize) +EE pooling_bp( + T *input, T *output, U32 in, U32 ic, U32 ih, U32 iw, U32 oh, U32 ow, PoolingParamSpec p, U32 alignSize) { - UNUSED(pm); CHECK_REQUIREMENT(ic % alignSize == 0); ic = ic / alignSize; - + float poolSize = p.kernel_h * p.kernel_w; for (U32 n = 0; n < in; n++) { for (U32 c = 0; c < ic; c++) { for (U32 j = 0; j < alignSize; j++) { for (I32 h = 0; h < (I32)ih; h++) { for (I32 w = 0; w < (I32)iw; w++) { - int hstart = int(h * strideH - paddingT); - int wstart = int(w * strideW - paddingL); - int hend = hstart + kernelH; - int wend = wstart + kernelW; + int hstart = int(h * p.stride_h - p.pad_top); + int wstart = int(w * p.stride_w - p.pad_left); + int hend = hstart + p.kernel_h; + int wend = wstart + p.kernel_w; hstart = (hstart < 0) ? 0 : hstart; wstart = (wstart < 0) ? 0 : wstart; hend = (hend > (int)oh) ? oh : hend; wend = (wend > (int)ow) ? ow : wend; - float poolSize = (hend - hstart) * (wend - wstart); + if (!p.count_include_pad) { + poolSize = (hend - hstart) * (wend - wstart); + } for (int x = hstart; x < hend; x++) { for (int y = wstart; y < wend; y++) { U32 in_off = ((((n * ic + c) * ih) + h) * iw + w) * alignSize + j; @@ -67,11 +54,8 @@ EE pooling_bp(T *input, return SUCCESS; } -EE pooling_bp_general(TensorDesc inputDesc, - const void *input, - PoolingParamSpec poolingParamSpec, - TensorDesc outputDesc, - void *output) +EE pooling_bp_general( + TensorDesc inputDesc, const void *input, PoolingParamSpec p, TensorDesc outputDesc, void *output) { if (nullptr == input || nullptr == output) { CHECK_STATUS(NULL_POINTER); @@ -88,24 +72,15 @@ EE pooling_bp_general(TensorDesc inputDesc, if (idf != DF_NCHWC8 || odf != idf) { CHECK_STATUS(NOT_MATCH); } - - U32 strideH = poolingParamSpec.stride_h; - U32 strideW = poolingParamSpec.stride_w; - U32 paddingT = poolingParamSpec.padding_top; - U32 paddingL = poolingParamSpec.padding_left; - U32 kernelSizeH = poolingParamSpec.kernel_h; - U32 kernelSizeW = poolingParamSpec.kernel_w; - EE ret = SUCCESS; switch (idt) { #ifdef _USE_FP32 case DT_F32: - ret = pooling_bp((F32 *)input, (F32 *)output, in, ic, ih, iw, strideH, strideW, - paddingT, paddingL, kernelSizeH, kernelSizeW, poolingParamSpec.mode, oh, ow, 8); + ret = pooling_bp((F32 *)input, (F32 *)output, in, ic, ih, iw, oh, ow, p, 8); break; #endif default: ret = NOT_SUPPORTED; } return ret; -} \ No newline at end of file +} diff --git a/compute/tensor/src/cpu/general/rnn.cpp b/compute/tensor/src/cpu/general/rnn.cpp index 23e16d47..b45c648f 100644 --- a/compute/tensor/src/cpu/general/rnn.cpp +++ b/compute/tensor/src/cpu/general/rnn.cpp @@ -11,7 +11,6 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -#include #include #include "cpu/general/tensor_computing_general.h" @@ -69,14 +68,12 @@ static EE lstmcell(TensorDesc xDesc, U32 batch = in; U32 xDim = ix; - U32 hDim = rnnParamSpec.numOutput; - I32 column = (rnnParamSpec.numProjection > 0) ? rnnParamSpec.numProjection - : rnnParamSpec.numOutput; - int num1 = rnnParamSpec.biDirection ? 
2 : 1; + U32 hDim = rnnParamSpec.num_outputs; + I32 column = (rnnParamSpec.num_projection > 0) ? rnnParamSpec.num_projection + : rnnParamSpec.num_outputs; + int num1 = rnnParamSpec.bi_direction ? 2 : 1; U32 steps = batchStrideH / hDim / num1; - F32 forgetBias = rnnParamSpec.forgetBias; - ActivationMode activationMode = rnnParamSpec.activationMode; - if (activationMode != ACTIVATION_TANH) { + if (rnnParamSpec.activation_type != ACTIVATION_TANH) { CHECK_STATUS(NOT_SUPPORTED); } @@ -100,8 +97,8 @@ static EE lstmcell(TensorDesc xDesc, for (U32 m = 0; m < batch; m++) { T *lastBatchH = lastHArray + m * lastHStride; if (xDim > 0) { - memcpy(xhArray, currentXArray + m * batchStrideX, xDim * sizeof(T)); - memcpy(xhArray + xDim, lastBatchH, hDim * sizeof(T)); + UNI_MEMCPY(xhArray, currentXArray + m * batchStrideX, xDim * sizeof(T)); + UNI_MEMCPY(xhArray + xDim, lastBatchH, hDim * sizeof(T)); } else { intermediateH = tmpArray; xhArray = lastBatchH; @@ -109,7 +106,7 @@ static EE lstmcell(TensorDesc xDesc, // MVM const T *mBias = (const T *)bias[0] + m * steps * column * 4; - memcpy(intermediateH, mBias, column * 4 * sizeof(T)); + UNI_MEMCPY(intermediateH, mBias, column * 4 * sizeof(T)); mvm_nkn32_template(fn / 32, fk, (const T *)filter[0], xhArray, intermediateH); T *out_i = intermediateH; @@ -121,12 +118,12 @@ static EE lstmcell(TensorDesc xDesc, T *currentBatchH = currentHArray + m * currentHStride; T *currentOutput = outputArray + m * batchStrideH; T *tmpState, *tmpHH, *tmpH; - if (rnnParamSpec.zoneoutCell == 0) { + if (rnnParamSpec.zoneout_cell == 0) { tmpState = currentBatchState; } else { tmpState = out_i; } - if (rnnParamSpec.numProjection > 0) { + if (rnnParamSpec.num_projection > 0) { tmpHH = out_g; tmpH = currentOutput; } else { @@ -138,7 +135,7 @@ static EE lstmcell(TensorDesc xDesc, F32 C_s = lastBatchState[h]; F32 I_s = 1.0 / (1.0 + exp(-out_i[h])); F32 G_s = tanh(out_g[h]); - F32 F_s = 1.0 / (1.0 + exp(-(out_f[h] + forgetBias))); + F32 F_s = 1.0 / (1.0 + exp(-(out_f[h] + rnnParamSpec.forget_bias))); F32 O_s = 1.0 / (1.0 + exp(-out_o[h])); C_s = C_s * F_s + I_s * G_s; F32 value = O_s * tanh(C_s); @@ -146,28 +143,28 @@ static EE lstmcell(TensorDesc xDesc, tmpHH[h] = value; } - if (rnnParamSpec.zoneoutCell != 0) { - array_scale_template(tmpState, tmpState, column, 1 - rnnParamSpec.zoneoutCell, 0); + if (rnnParamSpec.zoneout_cell != 0) { + array_scale_template(tmpState, tmpState, column, 1 - rnnParamSpec.zoneout_cell, 0); array_scale_template( - lastBatchState, lastBatchState, column, rnnParamSpec.zoneoutCell, 0); + lastBatchState, lastBatchState, column, rnnParamSpec.zoneout_cell, 0); array_add_template(tmpState, lastBatchState, currentBatchState, column); } - if (rnnParamSpec.numProjection > 0) { - memset(tmpH, 0, sizeof(T) * hDim); + if (rnnParamSpec.num_projection > 0) { + UNI_MEMSET(tmpH, 0, sizeof(T) * hDim); mvm_nkn32_template( - hDim / 32, rnnParamSpec.numProjection, (const T *)filter[1], tmpHH, tmpH); + hDim / 32, rnnParamSpec.num_projection, (const T *)filter[1], tmpHH, tmpH); } - if (rnnParamSpec.zoneoutOutput != 0) { - if (rnnParamSpec.numProjection > 0) { - array_scale_template(tmpH, out_f, hDim, 1 - rnnParamSpec.zoneoutOutput, 0); + if (rnnParamSpec.zoneout_output != 0) { + if (rnnParamSpec.num_projection > 0) { + array_scale_template(tmpH, out_f, hDim, 1 - rnnParamSpec.zoneout_output, 0); } else { - array_scale_template(tmpHH, out_f, hDim, 1 - rnnParamSpec.zoneoutOutput, 0); + array_scale_template(tmpHH, out_f, hDim, 1 - rnnParamSpec.zoneout_output, 0); } - 
array_scale_template(lastBatchH, lastBatchH, hDim, rnnParamSpec.zoneoutOutput, 0); + array_scale_template(lastBatchH, lastBatchH, hDim, rnnParamSpec.zoneout_output, 0); array_add_template(out_f, lastBatchH, currentBatchH, hDim); } else { - memcpy(currentBatchH, currentOutput, sizeof(T) * hDim); + UNI_MEMCPY(currentBatchH, currentOutput, sizeof(T) * hDim); } } return SUCCESS; @@ -210,12 +207,11 @@ static EE grucell(TensorDesc xDesc, U32 batch = in; U32 xDim = ix; - U32 hDim = rnnParamSpec.numOutput; + U32 hDim = rnnParamSpec.num_outputs; I32 column = hDim; - int num1 = rnnParamSpec.biDirection ? 2 : 1; + int num1 = rnnParamSpec.bi_direction ? 2 : 1; U32 steps = batchStrideH / hDim / num1; - ActivationMode activationMode = rnnParamSpec.activationMode; - if (activationMode != ACTIVATION_TANH) { + if (rnnParamSpec.activation_type != ACTIVATION_TANH) { CHECK_STATUS(NOT_SUPPORTED); } @@ -237,15 +233,15 @@ static EE grucell(TensorDesc xDesc, T *currentBatchH = currentHArray + m * currentHStride; T *currentOutput = outputArray + m * batchStrideH; if (xDim > 0) { - memcpy(xhArray, currentXArray + m * batchStrideX, xDim * sizeof(T)); - memcpy(xhArray + xDim, lastBatchH, hDim * sizeof(T)); + UNI_MEMCPY(xhArray, currentXArray + m * batchStrideX, xDim * sizeof(T)); + UNI_MEMCPY(xhArray + xDim, lastBatchH, hDim * sizeof(T)); } else { intermediateH = tmpArray; xhArray = lastBatchH; - memcpy(currentOutput, lastBatchH, hDim * sizeof(T)); + UNI_MEMCPY(currentOutput, lastBatchH, hDim * sizeof(T)); } const T *mBias = (const T *)bias[0] + m * steps * column * 3; - memcpy(intermediateH, mBias, column * 2 * sizeof(T)); + UNI_MEMCPY(intermediateH, mBias, column * 2 * sizeof(T)); mvm_nkn32_template(column * 2 / 32, fk, (const T *)filter[0], xhArray, intermediateH); T *out_z = intermediateH; T *out_r = out_z + column; @@ -258,12 +254,12 @@ static EE grucell(TensorDesc xDesc, if (rnnParamSpec.mode == RNN_GRU_LBR) { T *h_x_b = (T *)mBias + column * 2; T *h_h_b = (T *)bias[1]; - memcpy(out_h, h_h_b, column * sizeof(T)); + UNI_MEMCPY(out_h, h_h_b, column * sizeof(T)); mvm_nkn32_template(column / 32, hDim, (const T *)filter[0] + column * 2 * fk + column * xDim, xhArray + xDim, out_h); array_mul_template(out_r, out_h, out_h, hDim); if (xDim > 0) { - memcpy(out_r, h_x_b, column * sizeof(T)); + UNI_MEMCPY(out_r, h_x_b, column * sizeof(T)); mvm_nkn32_template( column / 32, xDim, (const T *)filter[0] + column * 2 * fk, xhArray, out_r); h_x_b = out_r; @@ -271,7 +267,7 @@ static EE grucell(TensorDesc xDesc, array_add_template(h_x_b, out_h, out_h, hDim); } else { array_mul_template(out_r, xhArray + xDim, xhArray + xDim, hDim); - memcpy(out_h, (const T *)mBias + column * 2, column * sizeof(T)); + UNI_MEMCPY(out_h, (const T *)mBias + column * 2, column * sizeof(T)); mvm_nkn32_template( column / 32, fk, (const T *)filter[0] + column * 2 * fk, xhArray, out_h); } @@ -287,7 +283,7 @@ static EE grucell(TensorDesc xDesc, array_scale_template(out_z, out_z, column, -1, 1); array_mul_template(out_z, out_h, out_h, column); array_add_template(out_r, out_h, currentOutput, column); - memcpy(currentBatchH, currentOutput, sizeof(T) * hDim); + UNI_MEMCPY(currentBatchH, currentOutput, sizeof(T) * hDim); } return SUCCESS; } diff --git a/compute/tensor/src/cpu/general/softmax.cpp b/compute/tensor/src/cpu/general/softmax.cpp index f454852d..493ff7c3 100644 --- a/compute/tensor/src/cpu/general/softmax.cpp +++ b/compute/tensor/src/cpu/general/softmax.cpp @@ -26,7 +26,7 @@ static F32 array_max(const T *input, U32 len, U32 stride) return tmp; } 
-template +template static EE softmax(TensorDesc inputDesc, const T *input, int axis, TensorDesc outputDesc, T *output) { UNUSED(outputDesc); @@ -76,14 +76,23 @@ static EE softmax(TensorDesc inputDesc, const T *input, int axis, TensorDesc out T *out = output + i * loops * loop_inner + j; F32 max_value = array_max(in, loops, loop_inner); F32 sum = 0; - for (U32 k = 0; k < loops; k++) { - F32 tmp = exp(in[k * loop_inner] - max_value); + for (U32 k = 0, d = 0; k < loops; k++, d += loop_inner) { + F32 tmp = exp(in[d] - max_value); sum += tmp; - out[k * loop_inner] = tmp; + if (!logsoftmax) { + out[d] = tmp; + } } - sum = 1 / sum; - for (U32 k = 0; k < loops; k++) { - out[k * loop_inner] *= sum; + if (logsoftmax) { + sum = max_value + log(sum); + for (U32 k = 0, d = 0; k < loops; k++, d += loop_inner) { + out[d] = in[d] - sum; + } + } else { + sum = 1 / sum; + for (U32 k = 0, d = 0; k < loops; k++, d += loop_inner) { + out[d] *= sum; + } } } } @@ -93,25 +102,49 @@ static EE softmax(TensorDesc inputDesc, const T *input, int axis, TensorDesc out EE softmax_general( TensorDesc inputDesc, const void *input, SoftmaxParamSpec p, TensorDesc outputDesc, void *output) { - DataType idt = inputDesc.dt; - EE ret = SUCCESS; - switch (idt) { + EE ret = NOT_SUPPORTED; + switch (inputDesc.dt) { #ifdef _USE_FP16 case DT_F16: { - ret = softmax(inputDesc, (const F16 *)input, p.axis, outputDesc, (F16 *)output); + ret = softmax( + inputDesc, (const F16 *)input, p.axis, outputDesc, (F16 *)output); break; } #endif #ifdef _USE_FP32 case DT_F32: { - ret = softmax(inputDesc, (const F32 *)input, p.axis, outputDesc, (F32 *)output); + ret = softmax( + inputDesc, (const F32 *)input, p.axis, outputDesc, (F32 *)output); break; } #endif default: - ret = NOT_SUPPORTED; break; } + return ret; +} +EE logsoftmax_general( + TensorDesc inputDesc, const void *input, SoftmaxParamSpec p, TensorDesc outputDesc, void *output) +{ + EE ret = NOT_SUPPORTED; + switch (inputDesc.dt) { +#ifdef _USE_FP16 + case DT_F16: { + ret = softmax( + inputDesc, (const F16 *)input, p.axis, outputDesc, (F16 *)output); + break; + } +#endif +#ifdef _USE_FP32 + case DT_F32: { + ret = softmax( + inputDesc, (const F32 *)input, p.axis, outputDesc, (F32 *)output); + break; + } +#endif + default: + break; + } return ret; } diff --git a/compute/tensor/src/cpu/general/tensor_computing_general.h b/compute/tensor/src/cpu/general/tensor_computing_general.h index 98364fd4..2cb8766a 100644 --- a/compute/tensor/src/cpu/general/tensor_computing_general.h +++ b/compute/tensor/src/cpu/general/tensor_computing_general.h @@ -94,6 +94,7 @@ EE depthwise_convolution_general(TensorDesc inputDesc, EE pooling_general(TensorDesc inputDesc, const void *input, PoolingParamSpec poolingParamSpec, + void *scale, TensorDesc outputDesc, void *output); @@ -145,6 +146,9 @@ EE scale_general(TensorDesc inputDesc, EE softmax_general( TensorDesc inputDesc, const void *input, SoftmaxParamSpec p, TensorDesc outputDesc, void *output); +EE logsoftmax_general( + TensorDesc inputDesc, const void *input, SoftmaxParamSpec p, TensorDesc outputDesc, void *output); + EE check_general(TensorDesc inputDescA, const void *inputA, TensorDesc inputDescB, @@ -153,8 +157,13 @@ EE check_general(TensorDesc inputDescA, TensorDesc outputDesc, void *output); -EE layer_normalization_general( - TensorDesc inputDesc, void *input, void *alpha, void *beta, TensorDesc outputDesc, void *output); +EE layer_normalization_general(TensorDesc inputDesc, + void *input, + LayerNormParamSpec p, + void *alpha, + void *beta, + TensorDesc 
outputDesc, + void *output); EE attention_mask_general(TensorDesc inputDesc, const void *input, @@ -176,4 +185,7 @@ EE dequantize_general(TensorDesc qDesc, void *bData, TensorDesc dDesc, void *data); + +EE cumsum_general( + TensorDesc inputDesc, const void *input, CumSumParamSpec p, TensorDesc outputDesc, void *output); #endif diff --git a/compute/tensor/src/cpu/general/transpose.cpp b/compute/tensor/src/cpu/general/transpose.cpp index dbd0d0fd..22d1d4ee 100644 --- a/compute/tensor/src/cpu/general/transpose.cpp +++ b/compute/tensor/src/cpu/general/transpose.cpp @@ -11,8 +11,6 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -#include - #include "cpu/general/tensor_computing_general.h" EE transpose_general( @@ -42,7 +40,7 @@ EE transpose_general( inputIndex = (inputIndex + inputLocalIndex[j]) * inputDesc.dims[j - 1]; } inputIndex += inputLocalIndex[0]; - memcpy(output_ptr + i * bytesOf(outputDesc.dt), + UNI_MEMCPY(output_ptr + i * bytesOf(outputDesc.dt), input_ptr + inputIndex * bytesOf(inputDesc.dt), bytesOf(inputDesc.dt)); } return SUCCESS; diff --git a/compute/tensor/src/cpu/instance_norm.cpp b/compute/tensor/src/cpu/instance_norm.cpp index 36d76c13..d19d0e73 100644 --- a/compute/tensor/src/cpu/instance_norm.cpp +++ b/compute/tensor/src/cpu/instance_norm.cpp @@ -46,7 +46,7 @@ inline EE instance_norm_template( F32 eps = 1e-6; if (axisDim == (int)inputDesc.dims[axis]) { for (I32 i = 0; i < loopOuter; i += 8) { - F32 mean[8] = {0}; + double mean[8] = {0}; for (I32 j = 0; j < loopInner; ++j) { for (U32 ii = 0; ii < 8; ++ii) { mean[ii] += input[i * loopInner + j * 8 + ii]; diff --git a/compute/tensor/src/cpu/non_max_suppression.cpp b/compute/tensor/src/cpu/non_max_suppression.cpp index 23118306..7a9237e8 100644 --- a/compute/tensor/src/cpu/non_max_suppression.cpp +++ b/compute/tensor/src/cpu/non_max_suppression.cpp @@ -12,176 +12,71 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
#include "cpu/tensor_computing_cpu.h" - -inline EE qsort_descent(std::vector &boxes, - std::vector &boxindex, - std::vector &scores, - int left, - int right) -{ - if (boxes.empty() || scores.empty()) { - return NOT_SUPPORTED; - } - - int i = left; - int j = right; - F32 temp = scores[(left + right) / 2]; - - while (i <= j) { - while (scores[i] > temp) { - i++; - } - while (scores[j] < temp) { - j--; - } - if (i <= j) { - std::swap(boxes[i], boxes[j]); - std::swap(scores[i], scores[j]); - std::swap(boxindex[i], boxindex[j]); - i++; - j--; - } - } - - if (left < j) { - qsort_descent(boxes, boxindex, scores, left, j); - } - if (i < right) { - qsort_descent(boxes, boxindex, scores, i, right); - } - - return SUCCESS; -} - -inline F32 intersectionarea(BoxRect a, BoxRect b) -{ - if (a.xmin > b.xmax || a.xmax < b.xmin || a.ymin > b.ymax || a.ymax < b.ymin) { - return 0.f; - } - F32 inter_width = std::min(a.xmax, b.xmax) - std::max(a.xmin, b.xmin); - F32 inter_height = std::min(a.ymax, b.ymax) - std::max(a.ymin, b.ymin); - - return inter_width * inter_height; -} - -inline EE nms_pickedboxes(std::vector boxes, std::vector &picked, F32 nms_threshold) -{ - I64 n = boxes.size(); - - std::vector areas(n); - for (I64 i = 0; i < n; i++) { - BoxRect box = boxes[i]; - - F32 width = box.xmax - box.xmin; - F32 height = box.ymax - box.ymin; - - areas[i] = width * height; - } - for (I64 i = 0; i < n; i++) { - BoxRect a = boxes[i]; - int keep = 1; - for (int j = 0; j < (int)picked.size(); j++) { - BoxRect b = boxes[picked[j]]; - F32 inter_area = intersectionarea(a, b); - F32 union_area = areas[i] + areas[picked[j]] - inter_area; - - if (inter_area / union_area > nms_threshold) { - keep = 0; - } - } - if (keep) { - picked.push_back(i); - } - } - return SUCCESS; -} +#include "cpu/non_max_suppression.h" template EE non_max_suppression_kernel(std::vector input, - T *output, U32 spatial_dim, U32 num_class, U32 max_output_boxes_per_class, F32 iou_threshold, - F32 score_threshold) + F32 score_threshold, + int *output, + U32 *length) { T *box = (T *)input[0]; T *score = (T *)input[1]; // decode box - std::vector> boxes; - boxes.resize(spatial_dim); + std::vector> boxes(spatial_dim); for (U32 i = 0; i < spatial_dim; i++) { - F32 ymin = std::min(box[i * 4], box[i * 4 + 2]); - F32 xmin = std::min(box[i * 4 + 1], box[i * 4 + 3]); - F32 ymax = std::max(box[i * 4], box[i * 4 + 2]); - F32 xmax = std::max(box[i * 4 + 1], box[i * 4 + 3]); - std::vector box_pixel; - box_pixel.resize(4); - box_pixel[0] = xmin; - box_pixel[1] = ymin; - box_pixel[2] = xmax; - box_pixel[3] = ymax; - boxes[i].assign(box_pixel.begin(), box_pixel.end()); + F32 ymin = UNI_MIN(box[i * 4], box[i * 4 + 2]); + F32 xmin = UNI_MIN(box[i * 4 + 1], box[i * 4 + 3]); + F32 ymax = UNI_MAX(box[i * 4], box[i * 4 + 2]); + F32 xmax = UNI_MAX(box[i * 4 + 1], box[i * 4 + 3]); + boxes[i] = {xmin, ymin, xmax, ymax}; } - std::vector all_boxinfo; + int count = 0; for (U32 i = 0; i < num_class; i++) { - std::vector class_boxrects; - std::vector class_boxscores; - std::vector class_boxindex; + std::vector class_boxes; for (U32 j = 0; j < spatial_dim; j++) { F32 score_pixel = score[i * spatial_dim + j]; if (score_pixel > score_threshold) { - std::vector inbox; - inbox.assign(boxes[j].begin(), boxes[j].end()); - BoxRect b = {inbox[0], inbox[1], inbox[2], inbox[3], i}; - class_boxrects.push_back(b); - class_boxindex.push_back(j); - class_boxscores.push_back(score_pixel); + BoxRect b = {boxes[j][0], boxes[j][1], boxes[j][2], boxes[j][3], i, score_pixel, j}; + 
class_boxes.push_back(b);
             }
         }
-        // sort boxes and box index
-        qsort_descent(class_boxrects, class_boxindex, class_boxscores, 0,
-            static_cast(class_boxscores.size() - 1));
-        std::vector picked;
+        // sort boxes by score
+        std::stable_sort(
+            class_boxes.begin(), class_boxes.end(), [&](const BoxRect &a, const BoxRect &b) {
+                return (a.score > b.score || (a.score == b.score && a.index < b.index));
+            });
         // apply nms
-        nms_pickedboxes(class_boxrects, picked, iou_threshold);
-        std::vector boxindex;
-        for (I64 p = 0; p < (I64)picked.size(); p++) {
-            I64 picked_box = picked[p];
-            boxindex.push_back(class_boxindex[picked_box]);
-        }
-        if (max_output_boxes_per_class < (U32)boxindex.size()) {
-            boxindex.resize(max_output_boxes_per_class);
+        std::vector picked = nms_pickedboxes(class_boxes, iou_threshold);
+        if (max_output_boxes_per_class < picked.size()) {
+            picked.resize(max_output_boxes_per_class);
         }
-        for (I64 j = 0; j < (I64)boxindex.size(); j++) {
-            BoxInfo bi;
-            bi.box_index = boxindex[j];
-            bi.label = i;
-            all_boxinfo.push_back(bi);
+        for (U32 j = 0; j < picked.size(); j++) {
+            output[count * 3] = 0;
+            // class_index
+            output[count * 3 + 1] = i;
+            // box_index
+            output[count * 3 + 2] = class_boxes[picked[j]].index;
+            count++;
         }
     }
-    U32 num_detected = all_boxinfo.size();
-    // the first box contains the number of availble boxes in the first element.
-    output[0] = num_detected;
-    output[1] = output[2] = 0;
-    for (U32 i = 0; i < num_detected; i++) {
-        BoxInfo bi = all_boxinfo[i];
-        // batch_index = 0
-        output[(i + 1) * 3] = 0;
-        // class_index
-        output[(i + 1) * 3 + 1] = bi.label;
-        // box_index
-        output[(i + 1) * 3 + 2] = bi.box_index;
-    }
+    *length = count;
     return SUCCESS;
 }
 
 EE non_max_suppression_cpu(std::vector inputDesc,
     std::vector input,
-    NonMaxSuppressionParamSpec nonMaxSuppressionParamSpec,
+    NonMaxSuppressionParamSpec p,
     TensorDesc outputDesc,
-    void *output)
+    void *output,
+    U32 *length)
 {
     UNUSED(outputDesc);
     if (nullptr == output) {
@@ -198,25 +93,25 @@ EE non_max_suppression_cpu(std::vector inputDesc,
     U32 spatial_dim = ic0;
     U32 num_class = ic1;
     CHECK_REQUIREMENT(spatial_dim == ilens2);
-    U32 max_output_boxes_per_class = nonMaxSuppressionParamSpec.max_output_boxes_per_class;
-    F32 iou_threshold = nonMaxSuppressionParamSpec.iou_threshold;
-    F32 score_threshold = nonMaxSuppressionParamSpec.score_threshold;
     EE ret = SUCCESS;
     switch (idt0) {
 #ifdef _USE_FP32
         case DT_F32:
-            non_max_suppression_kernel(input, (F32 *)output, spatial_dim, num_class,
-                max_output_boxes_per_class, iou_threshold, score_threshold);
+            non_max_suppression_kernel(input, spatial_dim, num_class,
+                p.max_output_boxes_per_class, p.iou_threshold, p.score_threshold, (int *)output,
+                length);
             break;
 #endif
 #ifdef _USE_FP16
         case DT_F16:
-            non_max_suppression_kernel(input, (F16 *)output, spatial_dim, num_class,
-                max_output_boxes_per_class, iou_threshold, score_threshold);
+            non_max_suppression_kernel(input, spatial_dim, num_class,
+                p.max_output_boxes_per_class, p.iou_threshold, p.score_threshold, (int *)output,
+                length);
             break;
 #endif
         default:
             ret = NOT_SUPPORTED;
+            break;
     }
     return ret;
 }
diff --git a/compute/tensor/src/cpu/non_max_suppression.h b/compute/tensor/src/cpu/non_max_suppression.h
new file mode 100644
index 00000000..12a09ed6
--- /dev/null
+++ b/compute/tensor/src/cpu/non_max_suppression.h
@@ -0,0 +1,71 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_NON_MAX_SUPPRESSION_TENSOR_COMPUTING +#define _H_NON_MAX_SUPPRESSION_TENSOR_COMPUTING + +#include "parameter_spec.h" +#include "uni.h" +#include +#include + +typedef struct { + float xmin; + float ymin; + float xmax; + float ymax; + unsigned int label; + float score; + unsigned int index; +} BoxRect; + +inline F32 intersectionarea(const BoxRect &a, const BoxRect &b) +{ + if (a.xmin >= b.xmax || a.xmax <= b.xmin || a.ymin >= b.ymax || a.ymax <= b.ymin) { + return 0.f; + } + F32 inter_width = UNI_MIN(a.xmax, b.xmax) - UNI_MAX(a.xmin, b.xmin); + F32 inter_height = UNI_MIN(a.ymax, b.ymax) - UNI_MAX(a.ymin, b.ymin); + return inter_width * inter_height; +} + +inline std::vector nms_pickedboxes(const std::vector &boxes, F32 nms_threshold) +{ + I32 n = boxes.size(); + std::vector areas(n); + for (I32 i = 0; i < n; i++) { + const BoxRect &box = boxes[i]; + F32 width = box.xmax - box.xmin; + F32 height = box.ymax - box.ymin; + areas[i] = width * height; + } + std::vector picked; + for (I32 i = 0; i < n; i++) { + const BoxRect &a = boxes[i]; + bool keep = true; + for (U32 j = 0; j < picked.size(); j++) { + const BoxRect &b = boxes[picked[j]]; + F32 inter_area = intersectionarea(a, b); + F32 union_area = areas[i] + areas[picked[j]] - inter_area; + if (inter_area / union_area > nms_threshold) { + keep = false; + break; + } + } + if (keep) { + picked.push_back(i); + } + } + return picked; +} +#endif diff --git a/compute/tensor/src/cpu/non_zero.cpp b/compute/tensor/src/cpu/non_zero.cpp new file mode 100644 index 00000000..cbb018c5 --- /dev/null +++ b/compute/tensor/src/cpu/non_zero.cpp @@ -0,0 +1,61 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/tensor_computing_cpu.h" + +template +static inline int non_zero_kernel(TensorDesc inputDesc, T *input, TensorDesc outputDesc, int *output) +{ + int count = 0; + for (U32 i = 0; i < tensorNumElements(inputDesc); i++) { + if (input[i] != 0) { + count++; + } + } + int length = count; + count = 0; + for (U32 i = 0; i < tensorNumElements(inputDesc); i++) { + if (input[i] != 0) { + std::vector id = calculateLocalIndex(i, inputDesc.dims, inputDesc.nDims); + for (U32 j = 0; j < inputDesc.nDims; j++) { + output[j * length + count] = id[j]; + } + count++; + } + } + return length; +} + +EE non_zero_cpu(TensorDesc inputDesc, void *input, TensorDesc outputDesc, void *output, U32 *length) +{ + if (nullptr == input || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + EE ret = SUCCESS; + switch (inputDesc.dt) { +#ifdef _USE_FP32 + case DT_F32: + *length = non_zero_kernel(inputDesc, (F32 *)input, outputDesc, (I32 *)output); + break; +#endif +#ifdef _USE_FP16 + case DT_F16: + *length = non_zero_kernel(inputDesc, (F16 *)input, outputDesc, (I32 *)output); + break; +#endif + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/cpu/onehot.cpp b/compute/tensor/src/cpu/onehot.cpp new file mode 100644 index 00000000..827f0a57 --- /dev/null +++ b/compute/tensor/src/cpu/onehot.cpp @@ -0,0 +1,65 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/tensor_computing_cpu.h" + +template +static inline EE onehot_kernel( + TensorDesc inputDesc, IT *input, OneHotParamSpec p, TensorDesc outputDesc, OT *output) +{ + UNI_INIT(tensorNumElements(outputDesc), outputDesc.dt, p.values[0], output); + int axis = (p.axis + outputDesc.nDims) % outputDesc.nDims; + axis = outputDesc.nDims - 1 - axis; + int loopInner = 1, loopOuter = 1; + for (int i = 0; i < axis; i++) { + loopInner *= outputDesc.dims[i]; + } + for (U32 i = axis + 1; i < outputDesc.nDims; i++) { + loopOuter *= outputDesc.dims[i]; + } + for (int i = 0, k = 0; i < loopOuter; i++) { + for (int j = 0; j < loopInner; j++, k++) { + int index = input[k] >= 0 ? 
input[k] : input[k] + p.depth; + int id = (i * p.depth + index) * loopInner + j; + output[id] = p.values[1]; + } + } + return SUCCESS; +} + +EE onehot_cpu( + TensorDesc inputDesc, void *input, OneHotParamSpec p, TensorDesc outputDesc, void *output) +{ + if (nullptr == input || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + if (inputDesc.dt != DT_I32) { + return NOT_SUPPORTED; + } + EE ret = NOT_SUPPORTED; + switch (outputDesc.dt) { +#ifdef _USE_FP32 + case DT_F32: + ret = onehot_kernel(inputDesc, (I32 *)input, p, outputDesc, (F32 *)output); + break; +#endif +#ifdef _USE_FP16 + case DT_F16: + ret = onehot_kernel(inputDesc, (I32 *)input, p, outputDesc, (F16 *)output); + break; +#endif + default: + break; + } + return ret; +} diff --git a/compute/tensor/src/cpu/padding.cpp b/compute/tensor/src/cpu/padding.cpp index f9b87dd8..ded5eccd 100644 --- a/compute/tensor/src/cpu/padding.cpp +++ b/compute/tensor/src/cpu/padding.cpp @@ -12,7 +12,6 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #include "cpu/tensor_computing_cpu.h" -#include EE padding_infer_output_size_cpu( TensorDesc inputDesc, PadParamSpec padParamSpec, TensorDesc *outputDesc) @@ -66,6 +65,14 @@ EE padding_cpu(TensorDesc inputDesc, U32 alignSize = 1; if (idf == DF_NCHWC8) { alignSize = 8; + if (padParamSpec.front % 8 != 0 || padParamSpec.back % 8 != 0) { + UNI_ERROR_LOG("try to pad in channel dimension, input layout is nchwc8, but " + "padding(%d,%d) mod 8 != 0\n", + padParamSpec.front, padParamSpec.back); + } else { + padParamSpec.front /= 8; + padParamSpec.back /= 8; + } } ic /= alignSize; oc /= alignSize; @@ -81,7 +88,7 @@ EE padding_cpu(TensorDesc inputDesc, #ifdef _USE_FP16 case DT_F16: { F16 tmpV = padParamSpec.constant_value; - memcpy(&constant, &tmpV, bytesOf(odt)); + UNI_MEMCPY(&constant, &tmpV, bytesOf(odt)); break; } #endif @@ -90,12 +97,6 @@ EE padding_cpu(TensorDesc inputDesc, break; } - if (padParamSpec.front + padParamSpec.back != 0) { - if (padParamSpec.pad_mode != Pad_Constant || idf == DF_NCHWC8) { - UNI_ERROR_LOG("NOT SUPPORT this C channel padding\n"); - } - } - for (U32 n = 0; n < in; n++) { for (U32 c = 0; c < ic; c++) { for (U32 h = 0; h < ih; h++) { @@ -104,45 +105,35 @@ EE padding_cpu(TensorDesc inputDesc, U8 *outPtr = (U8 *)output + (((n * oc + (padParamSpec.front + c)) * oh + (padParamSpec.top + h)) * ow) * alignSize * bytesOf(odt); - if (padParamSpec.pad_mode == Pad_Constant) { - if (constant == 0) { - memset(outPtr, 0, padParamSpec.left * alignSize * bytesOf(odt)); - } else { - for (U32 i = 0; i < padParamSpec.left * alignSize; ++i) { - memcpy(outPtr + i * bytesOf(odt), &constant, bytesOf(odt)); - } - } + if (padParamSpec.pad_mode == PAD_CONSTANT) { + UNI_INIT( + padParamSpec.left * alignSize, odt, padParamSpec.constant_value, outPtr); outPtr += padParamSpec.left * alignSize * bytesOf(odt); - memcpy(outPtr, inPtr, iw * alignSize * bytesOf(idt)); + UNI_MEMCPY(outPtr, inPtr, iw * alignSize * bytesOf(idt)); outPtr += iw * alignSize * bytesOf(odt); - if (constant == 0) { - memset(outPtr, 0, padParamSpec.right * alignSize * bytesOf(odt)); - } else { - for (U32 i = 0; i < padParamSpec.right * alignSize; ++i) { - memcpy(outPtr + i * bytesOf(odt), &constant, bytesOf(odt)); - } - } + UNI_INIT( + padParamSpec.right * alignSize, odt, padParamSpec.constant_value, outPtr); } else { for (U32 w = 0; w < padParamSpec.left; w++) { U32 index = 0; - if (padParamSpec.pad_mode == Pad_Reflect) { + if (padParamSpec.pad_mode == PAD_REFLECT) { index = (padParamSpec.left - w) 
* alignSize * bytesOf(idt); - } else if (padParamSpec.pad_mode == Pad_Symmetric) { + } else if (padParamSpec.pad_mode == PAD_SYMMETRIC) { index = (padParamSpec.left - w - 1) * alignSize * bytesOf(idt); } - memcpy(outPtr, inPtr + index, alignSize * bytesOf(idt)); + UNI_MEMCPY(outPtr, inPtr + index, alignSize * bytesOf(idt)); outPtr += alignSize * bytesOf(idt); } - memcpy(outPtr, inPtr, iw * alignSize * bytesOf(idt)); + UNI_MEMCPY(outPtr, inPtr, iw * alignSize * bytesOf(idt)); outPtr += iw * alignSize * bytesOf(odt); for (U32 w = 0; w < padParamSpec.right; w++) { U32 index = (iw - 1) * alignSize * bytesOf(idt); - if (padParamSpec.pad_mode == Pad_Reflect) { + if (padParamSpec.pad_mode == PAD_REFLECT) { index = (iw - w - 2) * alignSize * bytesOf(idt); - } else if (padParamSpec.pad_mode == Pad_Symmetric) { + } else if (padParamSpec.pad_mode == PAD_SYMMETRIC) { index = (iw - w - 1) * alignSize * bytesOf(idt); } - memcpy(outPtr, inPtr + index, alignSize * bytesOf(idt)); + UNI_MEMCPY(outPtr, inPtr + index, alignSize * bytesOf(idt)); outPtr += alignSize * bytesOf(idt); } } @@ -150,26 +141,20 @@ EE padding_cpu(TensorDesc inputDesc, U8 *outPtr = (U8 *)output + (((n * oc + c) * oh) * ow) * alignSize * bytesOf(odt); for (U32 h = 0; h < padParamSpec.top; h++) { U32 index = h * ow * alignSize * bytesOf(odt); - if (padParamSpec.pad_mode == Pad_Constant) { - if (constant == 0) { - memset(outPtr + index, 0, ow * alignSize * bytesOf(odt)); - } else { - for (U32 i = 0; i < ow * alignSize; ++i) { - memcpy(outPtr + index + i * bytesOf(odt), &constant, bytesOf(odt)); - } - } - } else if (padParamSpec.pad_mode == Pad_Edge) { - memcpy(outPtr + index, + if (padParamSpec.pad_mode == PAD_CONSTANT) { + UNI_INIT(ow * alignSize, odt, padParamSpec.constant_value, outPtr + index); + } else if (padParamSpec.pad_mode == PAD_EDGE) { + UNI_MEMCPY(outPtr + index, outPtr + (padParamSpec.top * ow * alignSize * bytesOf(odt)), ow * alignSize * bytesOf(odt)); - } else if (padParamSpec.pad_mode == Pad_Reflect) { - memcpy(outPtr + index, + } else if (padParamSpec.pad_mode == PAD_REFLECT) { + UNI_MEMCPY(outPtr + index, outPtr + ((padParamSpec.top + padParamSpec.top - h) * ow * alignSize * bytesOf(odt)), ow * alignSize * bytesOf(odt)); - } else if (padParamSpec.pad_mode == Pad_Symmetric) { - memcpy(outPtr + index, + } else if (padParamSpec.pad_mode == PAD_SYMMETRIC) { + UNI_MEMCPY(outPtr + index, outPtr + ((padParamSpec.top + padParamSpec.top - h - 1) * ow * alignSize * bytesOf(odt)), @@ -180,24 +165,18 @@ EE padding_cpu(TensorDesc inputDesc, } for (U32 h = 0; h < padParamSpec.bottom; h++) { U32 index = (padParamSpec.top + ih + h) * ow * alignSize * bytesOf(odt); - if (padParamSpec.pad_mode == Pad_Constant) { - if (constant == 0) { - memset(outPtr + index, 0, ow * alignSize * bytesOf(odt)); - } else { - for (U32 i = 0; i < ow * alignSize; ++i) { - memcpy(outPtr + index + i * bytesOf(odt), &constant, bytesOf(odt)); - } - } - } else if (padParamSpec.pad_mode == Pad_Edge) { - memcpy(outPtr + index, + if (padParamSpec.pad_mode == PAD_CONSTANT) { + UNI_INIT(ow * alignSize, odt, padParamSpec.constant_value, outPtr + index); + } else if (padParamSpec.pad_mode == PAD_EDGE) { + UNI_MEMCPY(outPtr + index, outPtr + ((padParamSpec.top + ih - 1) * ow * alignSize * bytesOf(odt)), ow * alignSize * bytesOf(odt)); - } else if (padParamSpec.pad_mode == Pad_Reflect) { - memcpy(outPtr + index, + } else if (padParamSpec.pad_mode == PAD_REFLECT) { + UNI_MEMCPY(outPtr + index, outPtr + ((padParamSpec.top + ih - 2 - h) * ow * alignSize * bytesOf(odt)), ow * 
alignSize * bytesOf(odt)); - } else if (padParamSpec.pad_mode == Pad_Symmetric) { - memcpy(outPtr + index, + } else if (padParamSpec.pad_mode == PAD_SYMMETRIC) { + UNI_MEMCPY(outPtr + index, outPtr + ((padParamSpec.top + ih - 1 - h) * ow * alignSize * bytesOf(odt)), ow * alignSize * bytesOf(odt)); } else { @@ -209,26 +188,20 @@ EE padding_cpu(TensorDesc inputDesc, U8 *outPtr = (U8 *)output + (((n * oc) * oh) * ow) * alignSize * bytesOf(odt); for (U32 c = 0; c < padParamSpec.front; c++) { U32 index = c * oh * ow * alignSize * bytesOf(odt); - if (padParamSpec.pad_mode == Pad_Constant) { - if (constant == 0) { - memset(outPtr + index, 0, oh * ow * alignSize * bytesOf(odt)); - } else { - for (U32 i = 0; i < oh * ow * alignSize; ++i) { - memcpy(outPtr + index + i * bytesOf(odt), &constant, bytesOf(odt)); - } - } - } else if (padParamSpec.pad_mode == Pad_Edge) { - memcpy(outPtr + index, + if (padParamSpec.pad_mode == PAD_CONSTANT) { + UNI_INIT(oh * ow * alignSize, odt, padParamSpec.constant_value, outPtr + index); + } else if (padParamSpec.pad_mode == PAD_EDGE) { + UNI_MEMCPY(outPtr + index, outPtr + (padParamSpec.front * oh * ow * alignSize * bytesOf(odt)), oh * ow * alignSize * bytesOf(odt)); - } else if (padParamSpec.pad_mode == Pad_Reflect) { - memcpy(outPtr + index, + } else if (padParamSpec.pad_mode == PAD_REFLECT) { + UNI_MEMCPY(outPtr + index, outPtr + ((padParamSpec.front + padParamSpec.front - c) * oh * ow * alignSize * bytesOf(odt)), oh * ow * alignSize * bytesOf(odt)); - } else if (padParamSpec.pad_mode == Pad_Symmetric) { - memcpy(outPtr + index, + } else if (padParamSpec.pad_mode == PAD_SYMMETRIC) { + UNI_MEMCPY(outPtr + index, outPtr + ((padParamSpec.front + padParamSpec.front - c - 1) * oh * ow * alignSize * bytesOf(odt)), @@ -240,24 +213,18 @@ EE padding_cpu(TensorDesc inputDesc, for (U32 c = 0; c < padParamSpec.back; c++) { U32 index = (padParamSpec.front + ic + c) * oh * ow * alignSize * bytesOf(odt); - if (padParamSpec.pad_mode == Pad_Constant) { - if (constant == 0) { - memset(outPtr + index, 0, oh * ow * alignSize * bytesOf(odt)); - } else { - for (U32 i = 0; i < oh * ow * alignSize; ++i) { - memcpy(outPtr + index + i * bytesOf(odt), &constant, bytesOf(odt)); - } - } - } else if (padParamSpec.pad_mode == Pad_Edge) { - memcpy(outPtr + index, + if (padParamSpec.pad_mode == PAD_CONSTANT) { + UNI_INIT(oh * ow * alignSize, odt, padParamSpec.constant_value, outPtr + index); + } else if (padParamSpec.pad_mode == PAD_EDGE) { + UNI_MEMCPY(outPtr + index, outPtr + ((padParamSpec.front + ic - 1) * oh * ow * alignSize * bytesOf(odt)), oh * ow * alignSize * bytesOf(odt)); - } else if (padParamSpec.pad_mode == Pad_Reflect) { - memcpy(outPtr + index, + } else if (padParamSpec.pad_mode == PAD_REFLECT) { + UNI_MEMCPY(outPtr + index, outPtr + ((padParamSpec.front + ic - 2 - c) * oh * ow * alignSize * bytesOf(odt)), oh * ow * alignSize * bytesOf(odt)); - } else if (padParamSpec.pad_mode == Pad_Symmetric) { - memcpy(outPtr + index, + } else if (padParamSpec.pad_mode == PAD_SYMMETRIC) { + UNI_MEMCPY(outPtr + index, outPtr + ((padParamSpec.front + ic - 1 - c) * oh * ow * alignSize * bytesOf(odt)), oh * ow * alignSize * bytesOf(odt)); } else { diff --git a/compute/tensor/src/cpu/power.cpp b/compute/tensor/src/cpu/power.cpp index cf08407e..a467ead2 100644 --- a/compute/tensor/src/cpu/power.cpp +++ b/compute/tensor/src/cpu/power.cpp @@ -13,18 +13,32 @@ #include "cpu/tensor_computing_cpu.h" #include "cpu/cpu_functions.h" +#include "affinity_policy.h" +#include "uni.h" EE power_cpu( TensorDesc 
inputDesc, void *input, PowerParamSpec p, TensorDesc outputDesc, void *output, Arch arch) { UNUSED(outputDesc); + if (nullptr == input || nullptr == output) { + return NULL_POINTER; + } ArrayScaleFunction scale_func = get_array_scale_function(arch); ArrayPowerFunction power_func = get_array_power_function(arch); - if (nullptr == input || nullptr == output) { - CHECK_STATUS(NULL_POINTER); + int size = tensorNumElements(inputDesc); +#ifdef _USE_OPENMP + int tile = UNI_MAX(64, (((size + OMP_NUM_THREADS - 1) / OMP_NUM_THREADS + 7) / 8 * 8)); +#pragma omp parallel for num_threads(OMP_NUM_THREADS) + for (int i = 0; i < size; i += tile) +#else + int i = 0; + int tile = size; +#endif + { + int j = i * bytesOf(inputDesc.dt); + int num = UNI_MIN(size - i, tile); + scale_func(inputDesc.dt, ((U8 *)input) + j, ((U8 *)output) + j, num, p.scale, p.shift); + power_func(outputDesc.dt, ((U8 *)output) + j, ((U8 *)output) + j, num, p.power); } - - scale_func(inputDesc.dt, input, output, tensorNumElements(inputDesc), p.scale, p.shift); - power_func(outputDesc.dt, output, output, tensorNumElements(inputDesc), p.power); return SUCCESS; } diff --git a/compute/tensor/src/cpu/quantize.cpp b/compute/tensor/src/cpu/quantize.cpp index e37c7b3b..b46ea6ba 100644 --- a/compute/tensor/src/cpu/quantize.cpp +++ b/compute/tensor/src/cpu/quantize.cpp @@ -14,23 +14,34 @@ #include #include "cpu/tensor_computing_cpu.h" #include "cpu/cpu_functions.h" -#if defined(_USE_INT8) && defined(__aarch64__) -#include "cpu/arm/int8/v8/convolution_gemm.h" +#if defined(_USE_NEON) && defined(_USE_FP16) && defined(_USE_INT8) +#include "cpu/arm/int8/v8.2/convolution_gemm.h" #endif #ifdef _USE_X86 #include "cpu/x86/tensor_computing_x86.h" #endif +typedef EE (*scaleFunc)( + DataType dt, const void *input, INT8 *output, U32 length, F32 scale, bool clamp); + template inline static void apply_scale_round_template( const T *input, INT8 *output, U32 length, F32 scale, bool clamp) { for (U32 i = 0; i < length; i++) { - //output[i] = round_towards_zero(input[i] * scale, clamp); output[i] = round(input[i] * scale); } } +template +inline static void apply_scale_truncate_template( + const T *input, INT8 *output, U32 length, F32 scale, bool clamp) +{ + for (U32 i = 0; i < length; i++) { + output[i] = round_towards_zero(input[i] * scale, clamp); + } +} + inline EE apply_scale_round( DataType dt, const void *input, INT8 *output, U32 length, F32 scale, bool clamp) { @@ -56,6 +67,31 @@ inline EE apply_scale_round( return ret; } +inline EE apply_scale_truncate( + DataType dt, const void *input, INT8 *output, U32 length, F32 scale, bool clamp) +{ + EE ret = SUCCESS; + switch (dt) { +#ifdef _USE_FP32 + case DT_F32: + apply_scale_truncate_template((const F32 *)input, output, length, scale, clamp); + break; +#endif +#ifdef _USE_FP16 + case DT_F16: + apply_scale_truncate_template((const F16 *)input, output, length, scale, clamp); + break; +#endif + case DT_I32: + apply_scale_truncate_template((const I32 *)input, output, length, scale, clamp); + break; + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + EE quantize_hwncn8c4_cpu( TensorDesc dDesc, const void *data, TensorDesc *qDesc, void *qData, F32 *scale, Arch arch) { @@ -131,16 +167,14 @@ EE quantize_cpu( F32 min = minmax[0]; F32 max = minmax[1]; EE ret = SUCCESS; - ; + scaleFunc arrayScale = apply_scale_round; + if (max == 0 && min == 0) { *scale = 1; - memset(qData, 0, tensorNumBytes(*qDesc)); + UNI_MEMSET(qData, 0, tensorNumBytes(*qDesc)); } else { F32 absMax = UNI_MAX(UNI_ABS(max), UNI_ABS(min)); 
F32 scaleRaw = 127.0 / absMax; - if (*scale > 0 && dt != DT_I32) { - scaleRaw = *scale; - } bool clamp = false; INT8 *qArray = (INT8 *)qData; @@ -152,9 +186,9 @@ EE quantize_cpu( } const I32 *array = (const I32 *)data; I32 factor = 127 * 16777216 / (int)absMax; - // *scale *= scaleRaw; + U32 main = 0; -#if defined(_USE_INT8) && defined(__aarch64__) +#if defined(_USE_NEON) && defined(_USE_FP16) && defined(_USE_INT8) if (arch == ARM_A76 || arch == ARM_A55) { main = numData / 16; ret = quantize_I32(main * 4, (I32 *)data, factor, scaleRaw, qArray); @@ -167,7 +201,7 @@ EE quantize_cpu( if (*scale < scaleRaw) { *scale = scaleRaw; } - ret = apply_scale_round(dt, data, qArray, numData, *scale, (*scale) != scaleRaw); + ret = arrayScale(dt, data, qArray, numData, *scale, (*scale) != scaleRaw); } } UNI_DEBUG_LOG("tensor min value is %f, max value is %f, scale value is %f.\n", min, max, *scale); diff --git a/compute/tensor/src/cpu/reduction.cpp b/compute/tensor/src/cpu/reduction.cpp index e402adb4..e9405050 100644 --- a/compute/tensor/src/cpu/reduction.cpp +++ b/compute/tensor/src/cpu/reduction.cpp @@ -11,7 +11,6 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -#include #include "cpu/tensor_computing_cpu.h" #include "cpu/cpu_functions.h" @@ -29,16 +28,6 @@ static EE reduction_kernel(TensorDesc inputDesc, if (nullptr == input || nullptr == output) { CHECK_STATUS(NULL_POINTER); } - - ArraySumFunction sum_func = get_array_sum_function(arch); - ArrayMeanFunction mean_func = get_array_mean_function(arch); - ArrayVarFunction var_func = get_array_var_function(arch); - ArrayAddFunction add_func = get_array_add_function(arch); - ArrayMulAndAddFunction mul_and_add_func = get_array_mul_and_add_function(arch); - ArrayScaleFunction scale_func = get_array_scale_function(arch); - ArrayMinMaxValueFunction minmax_value_func = get_array_minmax_value_function(arch); - ArrayMaxFunction max_func = get_array_max_function(arch); - if (axis < 0) { axis = inputDesc.nDims + axis; } @@ -55,74 +44,94 @@ static EE reduction_kernel(TensorDesc inputDesc, U32 maskLen = tensorNumElements(maskDesc); maskLen = (maskLen > 0) ? 
maskLen : len; U32 axisDim = maskLen / len; - for (U32 i = 0; i < loopOuter; i++) { - if (loopInner == 1) { - if (mask != nullptr) { - return NOT_SUPPORTED; - } - const T *array = input + i * len; - F32 tmpValue = 0; - switch (reductionMode) { - case REDUCTION_SUM: - output[i] = sum_func(inputDesc.dt, array, len); - break; - case REDUCTION_MEAN: - output[i] = mean_func(inputDesc.dt, array, len); - break; - case REDUCTION_STD_DEVIATION: { - tmpValue = mean_func(inputDesc.dt, array, len); - tmpValue = var_func(inputDesc.dt, array, len, tmpValue); - output[i] = sqrt(tmpValue); - break; - } - case REDUCTION_SCALAR_PRODUCT: - output[i] = var_func(inputDesc.dt, array, len, 0); - break; - case REDUCTION_MAX: { - F32 maxValue = 0; - CHECK_STATUS(minmax_value_func(inputDesc.dt, array, len, 2, &maxValue)); - output[i] = maxValue; - break; - } - case REDUCTION_L2: { - tmpValue = var_func(inputDesc.dt, array, len, 0) * len; - output[i] = sqrt(tmpValue); - break; + EE ret = SUCCESS; +#ifdef _USE_OPENMP +#pragma omp parallel num_threads(OMP_NUM_THREADS) +#endif + { + ArraySumFunction sum_func = get_array_sum_function(arch); + ArrayMeanFunction mean_func = get_array_mean_function(arch); + ArrayVarFunction var_func = get_array_var_function(arch); + ArrayAddFunction add_func = get_array_add_function(arch); + ArrayMulAndAddFunction mul_and_add_func = get_array_mul_and_add_function(arch); + ArrayScaleFunction scale_func = get_array_scale_function(arch); + ArrayMinMaxValueFunction minmax_value_func = get_array_minmax_value_function(arch); + ArrayMaxFunction max_func = get_array_max_function(arch); +#ifdef _USE_OPENMP +#pragma omp for +#endif + for (U32 i = 0; i < loopOuter; i++) { + if (loopInner == 1) { + const T *array = input + i * len; + F32 tmpValue = 0; + switch (reductionMode) { + case REDUCTION_SUM: + output[i] = sum_func(inputDesc.dt, array, len); + break; + case REDUCTION_MEAN: + output[i] = mean_func(inputDesc.dt, array, len); + break; + case REDUCTION_STD_DEVIATION: { + tmpValue = mean_func(inputDesc.dt, array, len); + tmpValue = var_func(inputDesc.dt, array, len, tmpValue); + output[i] = sqrt(tmpValue); + break; + } + case REDUCTION_SCALAR_PRODUCT: + output[i] = var_func(inputDesc.dt, array, len, 0); + break; + case REDUCTION_MAX: { + F32 maxValue = 0; + CHECK_STATUS(minmax_value_func(inputDesc.dt, array, len, 2, &maxValue)); + output[i] = maxValue; + break; + } + case REDUCTION_L2: { + tmpValue = var_func(inputDesc.dt, array, len, 0) * len; + output[i] = sqrt(tmpValue); + break; + } + case REDUCTION_MIN: { + F32 minValue = 0; + CHECK_STATUS(minmax_value_func(inputDesc.dt, array, len, 1, &minValue)); + output[i] = minValue; + break; + } + default: + ret = NOT_SUPPORTED; + break; } - default: - return NOT_SUPPORTED; - } - } else { - CHECK_REQUIREMENT(REDUCTION_STD_DEVIATION != reductionMode); - for (U32 j = 0; j < maskLen; j += len) { - U32 axisIndex = j / len; - U32 outputIndex = (i * axisDim + axisIndex) * loopInner; - auto ptr2 = output + outputIndex; - for (U32 k = 0; k < len; k++) { - if (mask == nullptr || (mask != nullptr && mask[j + k] == 1)) { - auto ptr1 = &input[(i * len + k) * loopInner]; - if ((k == 0) && (reductionMode != REDUCTION_SCALAR_PRODUCT)) { - memcpy(ptr2, ptr1, loopInner * bytesOf(inputDesc.dt)); - continue; - } - if (reductionMode == REDUCTION_SUM || reductionMode == REDUCTION_MEAN) { - add_func(inputDesc.dt, ptr2, ptr1, ptr2, loopInner); - } else if (reductionMode == REDUCTION_SCALAR_PRODUCT) { - mul_and_add_func(inputDesc.dt, ptr1, ptr1, ptr2, ptr2, loopInner); - } else 
if (reductionMode == REDUCTION_MAX) { - max_func(inputDesc.dt, ptr2, ptr1, ptr2, loopInner); - } else { - return NOT_SUPPORTED; + } else { + for (U32 j = 0; j < maskLen; j += len) { + U32 axisIndex = j / len; + U32 outputIndex = (i * axisDim + axisIndex) * loopInner; + auto ptr2 = output + outputIndex; + for (U32 k = 0; k < len; k++) { + if (mask == nullptr || (mask != nullptr && mask[j + k] == 1)) { + auto ptr1 = &input[(i * len + k) * loopInner]; + if ((k == 0) && (reductionMode != REDUCTION_SCALAR_PRODUCT)) { + UNI_MEMCPY(ptr2, ptr1, loopInner * bytesOf(inputDesc.dt)); + continue; + } + if (reductionMode == REDUCTION_SUM || reductionMode == REDUCTION_MEAN) { + add_func(inputDesc.dt, ptr2, ptr1, ptr2, loopInner); + } else if (reductionMode == REDUCTION_SCALAR_PRODUCT) { + mul_and_add_func(inputDesc.dt, ptr1, ptr1, ptr2, ptr2, loopInner); + } else if (reductionMode == REDUCTION_MAX) { + max_func(inputDesc.dt, ptr2, ptr1, ptr2, loopInner); + } else { + ret = NOT_SUPPORTED; + } } } - } - if (reductionMode == REDUCTION_MEAN) { - scale_func(inputDesc.dt, ptr2, ptr2, loopInner, 1.0 / len, 0); + if (reductionMode == REDUCTION_MEAN) { + scale_func(inputDesc.dt, ptr2, ptr2, loopInner, 1.0 / len, 0); + } } } } } - return SUCCESS; + return ret; } EE reduction_cpu(TensorDesc inputDesc, @@ -143,7 +152,7 @@ EE reduction_cpu(TensorDesc inputDesc, int channel = tmpDesc.nDims - 1; if (inputDesc.df == DF_NCHWC8 || inputDesc.df == DF_NCHWC16) { U32 cx = (inputDesc.df == DF_NCHWC8) ? 8 : 16; - for (int i = 0; i < p.axes_num; i++) { + for (int i = 0; i < p.num_axes; i++) { // channel dimension if (p.axes[i] == 1 || p.axes[i] == -channel) { start = -1; @@ -159,8 +168,8 @@ EE reduction_cpu(TensorDesc inputDesc, } const void *tmp1 = input; void *tmp2 = nullptr; - for (int i = start; i < p.axes_num; i++) { - if (p.axes_num - start == 1) { + for (int i = start; i < p.num_axes; i++) { + if (p.num_axes - start == 1) { tmp2 = output; } else { tmp2 = (char *)tmp + (i - start) % 2 * (tmpBytes / 2); @@ -176,17 +185,27 @@ EE reduction_cpu(TensorDesc inputDesc, #ifdef _USE_FP32 case DT_F32: { ret = reduction_kernel(tmpDesc, (const F32 *)tmp1, maskDesc, - (const float *)mask, axis, p.reduction_mode, outputDesc, (F32 *)tmp2, arch); + (const float *)mask, axis, p.mode, outputDesc, (F32 *)tmp2, arch); break; } #endif #ifdef _USE_FP16 case DT_F16: { ret = reduction_kernel(tmpDesc, (const F16 *)tmp1, maskDesc, - (const float *)mask, axis, p.reduction_mode, outputDesc, (F16 *)tmp2, arch); + (const float *)mask, axis, p.mode, outputDesc, (F16 *)tmp2, arch); break; } #endif + case DT_I32: { + ret = reduction_kernel(tmpDesc, (const I32 *)tmp1, maskDesc, + (const float *)mask, axis, p.mode, outputDesc, (I32 *)tmp2, arch); + break; + } + case DT_U32: { + ret = reduction_kernel(tmpDesc, (const U32 *)tmp1, maskDesc, + (const float *)mask, axis, p.mode, outputDesc, (U32 *)tmp2, arch); + break; + } default: ret = NOT_SUPPORTED; break; @@ -200,7 +219,7 @@ EE reduction_cpu(TensorDesc inputDesc, } if (tmp2 != output) { - memcpy(output, tmp2, tensorNumBytes(outputDesc)); + UNI_MEMCPY(output, tmp2, tensorNumBytes(outputDesc)); } if (p.coeff != 1) { diff --git a/compute/tensor/src/cpu/reshape.cpp b/compute/tensor/src/cpu/reshape.cpp index ccf8a1f0..dea3054f 100644 --- a/compute/tensor/src/cpu/reshape.cpp +++ b/compute/tensor/src/cpu/reshape.cpp @@ -11,7 +11,6 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE 
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -#include #include "cpu/tensor_computing_cpu.h" EE reshape_infer_output_size_cpu(TensorDesc inputDesc, ReshapeParamSpec p, TensorDesc *outputDesc) @@ -19,8 +18,8 @@ EE reshape_infer_output_size_cpu(TensorDesc inputDesc, ReshapeParamSpec p, Tenso if (nullptr == outputDesc) { return NULL_POINTER; } - I32 *shape = p.shape_dims; - I32 shape_size = p.shape_size; + I32 *shape = p.shape; + I32 shape_size = p.num_shape; int inputElementNum = tensorNumElements(inputDesc); int outputElementNum = 1; for (int i = 0; i < shape_size; i++) { @@ -107,7 +106,7 @@ EE reshape_cpu(TensorDesc inputDesc, void *input, TensorDesc outputDesc, void *o if ((DF_NCHWC8 != inputDesc.df && DF_NCHWC16 != inputDesc.df) || sameDim) { if (output != input) { - memcpy(output, input, tensorNumBytes(outputDesc)); + UNI_MEMCPY(output, input, tensorNumBytes(outputDesc)); } } else { CHECK_REQUIREMENT(input != output); @@ -132,7 +131,7 @@ EE reshape_cpu(TensorDesc inputDesc, void *input, TensorDesc outputDesc, void *o for (U32 c = 0; c < ic; c++) { for (U32 hw = 0; hw < ih * iw; hw++) { for (U32 c8 = 0; c8 < cx; c8++) { - memcpy(outPtr + + UNI_MEMCPY(outPtr + elementBytes * (n * ic * cx * ih * iw + (c * cx + c8) * ih * iw + hw), inPtr + elementBytes * diff --git a/compute/tensor/src/cpu/rnn.cpp b/compute/tensor/src/cpu/rnn.cpp index 9d7441cc..a93d69ba 100644 --- a/compute/tensor/src/cpu/rnn.cpp +++ b/compute/tensor/src/cpu/rnn.cpp @@ -11,7 +11,6 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -#include #include "cpu/tensor_computing_cpu.h" #ifdef _USE_GENERAL #include "cpu/general/tensor_computing_general.h" @@ -23,7 +22,6 @@ #include "cpu/arm/tensor_computing_arm.h" #endif #include "blas_enhance.h" -#include "tensor_transpose.h" template void transformNK2NKN32(const T *src, U32 stride, T *dst, U32 N, U32 K) @@ -43,6 +41,7 @@ static EE rnn_transform_filter(TensorDesc filterDesc, RNNParamSpec rnnParamSpec, TensorDesc *ftmDesc, T *ftmArray, + float *scale, DataFormat ftmDataFormat, Arch arch) { @@ -68,12 +67,13 @@ static EE rnn_transform_filter(TensorDesc filterDesc, return NOT_MATCH; } U32 hDim; - if (rnnParamSpec.numProjection > 0) { - hDim = rnnParamSpec.numProjection; + if (rnnParamSpec.num_projection > 0) { + hDim = rnnParamSpec.num_projection; } else { - hDim = rnnParamSpec.numOutput; + hDim = rnnParamSpec.num_outputs; } - U32 xDim = fk - rnnParamSpec.numOutput; + U32 xDim = fk - rnnParamSpec.num_outputs; + *ftmDesc = tensor2df(fdt, ftmDataFormat, fn, fk); switch (ftmDataFormat) { case DF_NKN32: { // NK => NKN32 @@ -90,22 +90,41 @@ static EE rnn_transform_filter(TensorDesc filterDesc, } case DF_NKNx_NKN32: { // NK => NKNx_NKN32 - T *filterTmp = ftmArray + fn * fk; + std::vector filterTmp(fn * UNI_MAX(xDim, hDim)); for (U32 n = 0; n < fn; ++n) { - memcpy(filterTmp + n * xDim, filterArray + n * fk, xDim * sizeof(T)); + UNI_MEMCPY(filterTmp.data() + n * xDim, filterArray + n * fk, xDim * sizeof(T)); } TensorDesc mmmDesc = tensor2df(fdt, DF_TRANSPOSE, fn, xDim); - matrix_matrix_multiply_transform_rhs(mmmDesc, filterTmp, &mmmDesc, ftmArray, arch); + matrix_matrix_multiply_transform_rhs( + mmmDesc, filterTmp.data(), &mmmDesc, ftmArray, arch); - transformNK2NKN32( - filterArray + xDim, fk, ftmArray + fn * xDim, fn, rnnParamSpec.numOutput); + if (0) { +#if defined(_USE_INT8) && 
defined(_USE_ULTRA_OPTIMIZATION) + } else if (arch == X86_AVX512 && rnnParamSpec.mode == RNN_LSTM && + rnnParamSpec.num_projection == 0) { + for (U32 n = 0; n < fn; ++n) { + UNI_MEMCPY( + filterTmp.data() + n * hDim, filterArray + n * fk + xDim, hDim * sizeof(T)); + } + TensorDesc mvmDesc = tensor2df(fdt, DF_NORMAL, fn, hDim); + TensorDesc mvmQuantDesc = tensor2df(DT_I8, DF_NORMAL, fn, hDim); + TensorDesc mvmTransDesc; + std::vector filterQuant(fn * hDim); + CHECK_STATUS(quantize_cpu( + mvmDesc, filterTmp.data(), &mvmQuantDesc, filterQuant.data(), scale, arch)); + CHECK_STATUS(matrix_vector_multiply_transform_weight( + mvmQuantDesc, filterQuant.data(), &mvmTransDesc, ftmArray + fn * xDim, arch)); +#endif + } else { + transformNK2NKN32( + filterArray + xDim, fk, ftmArray + fn * xDim, fn, rnnParamSpec.num_outputs); + } break; } default: ret = NOT_MATCH; break; } - *ftmDesc = tensor2df(fdt, ftmDataFormat, fn, fk); return ret; } @@ -114,7 +133,9 @@ static EE rnn_transform_filter_cpu_kernel(TensorDesc filterDesc, RNNParamSpec rnnParamSpec, TensorDesc *ftmDesc, void *ftmArray, + float *scale, DataFormat ftmDataFormat, + Arch arch) { EE ret = SUCCESS; @@ -122,14 +143,14 @@ static EE rnn_transform_filter_cpu_kernel(TensorDesc filterDesc, #ifdef _USE_FP32 case DT_F32: { ret = rnn_transform_filter(filterDesc, (const F32 *)filterArray, rnnParamSpec, - ftmDesc, (F32 *)ftmArray, ftmDataFormat, arch); + ftmDesc, (F32 *)ftmArray, scale, ftmDataFormat, arch); break; } #endif #ifdef _USE_FP16 case DT_F16: { ret = rnn_transform_filter(filterDesc, (const F16 *)filterArray, rnnParamSpec, - ftmDesc, (F16 *)ftmArray, ftmDataFormat, arch); + ftmDesc, (F16 *)ftmArray, scale, ftmDataFormat, arch); break; } #endif @@ -145,10 +166,11 @@ EE rnn_transform_filter_cpu(const TensorDesc *filterDesc, RNNParamSpec rnnParamSpec, TensorDesc *ftmDesc, void **ftmArray, + float *scale, Arch arch) { - int num1 = rnnParamSpec.biDirection ? 2 : 1; - int num2 = rnnParamSpec.numProjection > 0 ? 2 : 1; + int num1 = rnnParamSpec.bi_direction ? 2 : 1; + int num2 = rnnParamSpec.num_projection > 0 ? 2 : 1; EE ret = SUCCESS; DataFormat ftmDataFormat; for (int i = 0; i < num1 * num2; i++) { @@ -158,7 +180,7 @@ EE rnn_transform_filter_cpu(const TensorDesc *filterDesc, ftmDataFormat = DF_NKN32; } CHECK_STATUS(rnn_transform_filter_cpu_kernel(filterDesc[i], filterArray[i], rnnParamSpec, - &ftmDesc[i], ftmArray[i], ftmDataFormat, arch)); + &ftmDesc[i], ftmArray[i], scale + i, ftmDataFormat, arch)); } return ret; } @@ -169,12 +191,13 @@ EE rnn_transform_filter_bytes_cpu( if (nullptr == bytes) { CHECK_STATUS(NULL_POINTER); } - int num1 = rnnParamSpec.biDirection ? 2 : 1; - int num2 = rnnParamSpec.numProjection > 0 ? 2 : 1; + int num1 = rnnParamSpec.bi_direction ? 2 : 1; + int num2 = rnnParamSpec.num_projection > 0 ? 2 : 1; for (int i = 0; i < num1 * num2; i++) { bytes[i] = tensorNumBytes(filterDesc[i]); - if (((i % 2 == 0) || (num2 == 1)) && (rnnParamSpec.steps >= 0)) { // RNN filter - bytes[i] += tensorNumBytes(filterDesc[i]); + // x86 need to add offset for U8 type, bytes = bias_length(fn) * size(int) + if (rnnParamSpec.mode == RNN_LSTM) { + bytes[i] += filterDesc[i].dims[1] * sizeof(I32); } } return SUCCESS; @@ -195,9 +218,9 @@ EE rnncell_infer_forward_tmp_bytes_cpu(TensorDesc inputDesc, DataFormat idf; U32 batch, xDim; CHECK_STATUS(tensor2dGet(inputDesc, &idt, &idf, &batch, &xDim)); - U32 hDim = rnnParamSpec.numOutput; - U32 column = (rnnParamSpec.numProjection > 0) ? 
rnnParamSpec.numProjection - : rnnParamSpec.numOutput; + U32 hDim = rnnParamSpec.num_outputs; + U32 column = (rnnParamSpec.num_projection > 0) ? rnnParamSpec.num_projection + : rnnParamSpec.num_outputs; EE ret = SUCCESS; U32 factor = 0; switch (rnnParamSpec.mode) { @@ -216,6 +239,8 @@ EE rnncell_infer_forward_tmp_bytes_cpu(TensorDesc inputDesc, } *bytes = (hDim + xDim + column * factor) * bytesOf(idt); + // for input quantization + *bytes += (hDim + xDim) * bytesOf(DT_I8); return ret; } @@ -239,12 +264,12 @@ EE rnn_infer_forward_tmp_bytes_cpu(TensorDesc inputDesc, for (U32 i = 0; i < inputDesc.nDims - 3; ++i) { xDim *= inputDesc.dims[i]; } - U32 hDim = rnnParamSpec.numOutput; + U32 hDim = rnnParamSpec.num_outputs; TensorDesc xDesc = tensor2df(idt, DF_NORMAL, batch, xDim); CHECK_STATUS(rnncell_infer_forward_tmp_bytes_cpu( xDesc, filterDesc, outputDesc, rnnParamSpec, bytes, arch)); - U32 column = (rnnParamSpec.numProjection > 0) ? rnnParamSpec.numProjection - : rnnParamSpec.numOutput; + U32 column = (rnnParamSpec.num_projection > 0) ? rnnParamSpec.num_projection + : rnnParamSpec.num_outputs; EE ret = SUCCESS; U32 factor = 0; switch (rnnParamSpec.mode) { @@ -262,16 +287,20 @@ EE rnn_infer_forward_tmp_bytes_cpu(TensorDesc inputDesc, break; } - int num1 = rnnParamSpec.biDirection ? 2 : 1; + int num1 = rnnParamSpec.bi_direction ? 2 : 1; *bytes += batch * ((column + hDim) * num1 + column * factor) * bytesOf(idt); if (idf == DF_NCHWC8) { *bytes += tensorNumBytes(inputDesc); } - if (rnnParamSpec.steps >= 0) { //RNN - *bytes += batch * step * column * factor * bytesOf(idt); // Intermediate gate result - *bytes += UNI_MAX(batch * step * xDim, xDim * column) * bytesOf(idt); // mmm tmp buffer + if (rnnParamSpec.steps >= 0) { + // Intermediate gate result + *bytes += batch * step * column * factor * bytesOf(idt); + // mmm tmp buffer + *bytes += UNI_MAX(batch * step * xDim, xDim * column) * bytesOf(idt); *bytes += 32; } + // for input quantization + *bytes += (hDim + xDim) * bytesOf(DT_I8); return ret; } @@ -281,6 +310,7 @@ EE rnncell_cpu(TensorDesc xDesc, const void **filter, const TensorDesc *biasDesc, const void **bias, + float *scale, void *state, RNNParamSpec rnnParamSpec, U32 batchStrideX, @@ -299,8 +329,8 @@ EE rnncell_cpu(TensorDesc xDesc, #endif #ifdef _USE_X86 } else if (IS_X86(arch)) { - ret = rnncell_x86(xDesc, currentX, filterDesc, filter, biasDesc, bias, state, tmpBytes, tmp, - rnnParamSpec, batchStrideX, batchStrideH, hDesc, currentH, arch); + ret = rnncell_x86(xDesc, currentX, filterDesc, filter, biasDesc, bias, scale, state, + tmpBytes, tmp, rnnParamSpec, batchStrideX, batchStrideH, hDesc, currentH, arch); #endif #ifdef _USE_NEON } else if (IS_ARM(arch)) { @@ -317,6 +347,7 @@ EE rnn_cpu(TensorDesc inputDesc, const void **filter, const TensorDesc *biasDesc, const void **bias, + float *scale, RNNParamSpec rnnParamSpec, U32 tmpBytes, void *tmp, @@ -334,7 +365,7 @@ EE rnn_cpu(TensorDesc inputDesc, DataType fdt; DataFormat fdf; U32 fk, fn; - int num1 = rnnParamSpec.biDirection ? 2 : 1; + int num1 = rnnParamSpec.bi_direction ? 2 : 1; CHECK_STATUS(tensor2dGet(filterDesc[0], &fdt, &fdf, &fn, &fk)); if (fdf != DF_NKNx_NKN32) { CHECK_STATUS(NOT_MATCH); @@ -357,9 +388,9 @@ EE rnn_cpu(TensorDesc inputDesc, tmp = (U8 *)tmp + tensorNumBytes(tmpDesc); } - U32 hDim = rnnParamSpec.numOutput; - I32 column = (rnnParamSpec.numProjection > 0) ? rnnParamSpec.numProjection - : rnnParamSpec.numOutput; + U32 hDim = rnnParamSpec.num_outputs; + I32 column = (rnnParamSpec.num_projection > 0) ? 
rnnParamSpec.num_projection + : rnnParamSpec.num_outputs; U8 bytesOfIdt = bytesOf(idt); U32 batchStrideX = step * xDim; U32 batchStrideH = step * hDim * num1; @@ -377,7 +408,7 @@ EE rnn_cpu(TensorDesc inputDesc, U32 tileSize = fn * bytesOfIdt; for (U32 m = 0; m < batch; m++) { for (U32 t = 0; t < step; ++t) { - memcpy(InterGate + (m * step + t) * tileSize, bias[0], tileSize); + UNI_MEMCPY(InterGate + (m * step + t) * tileSize, bias[0], tileSize); } } @@ -394,7 +425,7 @@ EE rnn_cpu(TensorDesc inputDesc, const void *useFilter[2] = {(const void *)(mmmFilter + fn * xDim * bytesOfIdt), nullptr}; const void *useBias[2] = {nullptr, nullptr}; - if (rnnParamSpec.numProjection > 0) { + if (rnnParamSpec.num_projection > 0) { useFilter[1] = filter[1]; } if (rnnParamSpec.mode == RNN_GRU_LBR) { @@ -405,23 +436,23 @@ EE rnn_cpu(TensorDesc inputDesc, U8 *currentH = (U8 *)output + t * hDim * num1 * bytesOfIdt; useBias[0] = (void *)(InterGate + t * fn * bytesOfIdt); CHECK_STATUS(rnncell_cpu(xDesc, nullptr, &useFilterDesc, useFilter, biasDesc, useBias, - cellState, rnnParamSpec, batchStrideX, batchStrideH, tmpBytes, intermediateH, hDesc, - currentH, arch)); + scale, cellState, rnnParamSpec, batchStrideX, batchStrideH, tmpBytes, intermediateH, + hDesc, currentH, arch)); } - if (rnnParamSpec.biDirection) { - int fCount = (rnnParamSpec.numProjection > 0) ? 2 : 1; + if (rnnParamSpec.bi_direction) { + int fCount = (rnnParamSpec.num_projection > 0) ? 2 : 1; int bCount = (rnnParamSpec.mode == RNN_GRU_LBR) ? 2 : 1; mmmFilter = (const U8 *)filter[fCount]; for (U32 m = 0; m < batch; m++) { for (U32 t = 0; t < step; ++t) { - memcpy(InterGate + (m * step + t) * tileSize, bias[bCount], tileSize); + UNI_MEMCPY(InterGate + (m * step + t) * tileSize, bias[bCount], tileSize); } } CHECK_STATUS(matrix_matrix_multiply(inDesc, inputTmp, mmmFilterDesc, mmmFilter, step * xDim * bytesOfIdt, tmpArray, outDesc, InterGate, nullptr, arch)); useFilter[0] = mmmFilter + fn * xDim * bytesOfIdt; - if (rnnParamSpec.numProjection > 0) { + if (rnnParamSpec.num_projection > 0) { useFilter[1] = filter[fCount + 1]; } if (rnnParamSpec.mode == RNN_GRU_LBR) { @@ -432,8 +463,8 @@ EE rnn_cpu(TensorDesc inputDesc, U8 *currentH = (U8 *)output + (t * hDim * num1 + hDim) * bytesOfIdt; useBias[0] = (void *)(InterGate + t * fn * bytesOfIdt); CHECK_STATUS(rnncell_cpu(xDesc, nullptr, &useFilterDesc, useFilter, biasDesc, useBias, - cellState, rnnParamSpec, batchStrideX, batchStrideH, tmpBytes, intermediateH, hDesc, - currentH, arch)); + scale + fCount, cellState, rnnParamSpec, batchStrideX, batchStrideH, tmpBytes, + intermediateH, hDesc, currentH, arch)); } } return SUCCESS; diff --git a/compute/tensor/src/cpu/roialign.cpp b/compute/tensor/src/cpu/roialign.cpp index 25f5304a..e0435928 100644 --- a/compute/tensor/src/cpu/roialign.cpp +++ b/compute/tensor/src/cpu/roialign.cpp @@ -14,11 +14,12 @@ #include "cpu/tensor_computing_cpu.h" #include "tensor_transpose.h" -template -static F32 bilinear_interpolate(T *data, U32 w, U32 h, F32 x, F32 y) +static void preprocess(U32 w, U32 h, F32 x, F32 y, int c8Align, F32 *factor, U32 *offset) { if (y < -1.0 || y > h || x < -1.0 || x > w) { - return 0; + UNI_MEMSET(factor, 0, sizeof(float) * 4); + UNI_MEMSET(offset, 0, sizeof(U32) * 4); + return; } if (y <= 0) { y = 0; @@ -32,37 +33,36 @@ static F32 bilinear_interpolate(T *data, U32 w, U32 h, F32 x, F32 y) U32 y0 = y; U32 y1 = y0 + 1; - F32 hx = x1 - x; - F32 lx = x - x0; - F32 hy = y1 - y; - F32 ly = y - y0; - - if (x1 >= w) { - x1 = w - 1; - hx = 1.f; - lx = 0.f; + if (y0 
>= h - 1) { + y0 = y1 = h - 1; + y = y0; } - if (y1 >= h) { - y1 = h - 1; - hy = 1.f; - ly = 0.f; + if (x0 >= w - 1) { + x0 = x1 = w - 1; + x = x0; } - - F32 r0 = data[y0 * w + x0] * hx + data[y0 * w + x1] * lx; - F32 r1 = data[y1 * w + x0] * hx + data[y1 * w + x1] * lx; - - F32 val = r0 * hy + r1 * ly; - return val; + F32 lx = x - x0; + F32 ly = y - y0; + F32 hx = 1 - lx; + F32 hy = 1 - ly; + factor[0] = hy * hx; + factor[1] = hy * lx; + factor[2] = ly * hx; + factor[3] = ly * lx; + offset[0] = (y0 * w + x0) * c8Align; + offset[1] = (y0 * w + x1) * c8Align; + offset[2] = (y1 * w + x0) * c8Align; + offset[3] = (y1 * w + x1) * c8Align; } -template -static EE roialign_kernel(std::vector input, - T *output, - std::vector inputDesc, +template +static void roialign_kernel(std::vector inputDesc, + std::vector input, U32 output_h, U32 output_w, U32 sampling_ratio, - F32 spatial_scale) + F32 spatial_scale, + T *output) { DataType idt0, idt1; DataFormat idf0, idf1; @@ -72,27 +72,24 @@ static EE roialign_kernel(std::vector input, CHECK_STATUS(tensor2dGet(inputDesc[1], &idt1, &idf1, &ih1, &iw1)); T *feature_map = (T *)input[0]; T *rois = (T *)input[1]; - CHECK_REQUIREMENT(idf0 == DF_NCHWC8 || idf0 == DF_NCHW); - if (inputDesc[0].df == DF_NCHWC8) { - T *tmp = (T *)malloc(tensorNumBytes(inputDesc[0])); - memcpy(tmp, feature_map, tensorNumBytes(inputDesc[0])); - CHECK_STATUS(transformToNCHW(inputDesc[0], tmp, inputDesc[0], feature_map)); - free(tmp); + U32 c8Align = 1; + if (idf0 == DF_NCHWC8) { + c8Align = 8; } U32 channel = ic0; U32 feature_w = iw0; U32 feature_h = ih0; U32 num_rois = ih1; - for (U32 n = 0; n < num_rois; n++) { - U32 idx_n = n * channel * output_w * output_h; + F32 val; + for (U32 n = 0, idx = 0; n < num_rois; n++) { F32 roi_start_x1 = static_cast(rois[n * 4]) * spatial_scale; F32 roi_start_y1 = static_cast(rois[n * 4 + 1]) * spatial_scale; F32 roi_end_x2 = static_cast(rois[n * 4 + 2]) * spatial_scale; F32 roi_end_y2 = static_cast(rois[n * 4 + 3]) * spatial_scale; - F32 roi_w = std::max(roi_end_x2 - roi_start_x1, 1.f); - F32 roi_h = std::max(roi_end_y2 - roi_start_y1, 1.f); + F32 roi_w = UNI_MAX(roi_end_x2 - roi_start_x1, 1.f); + F32 roi_h = UNI_MAX(roi_end_y2 - roi_start_y1, 1.f); F32 bin_size_w = roi_w / static_cast(output_w); F32 bin_size_h = roi_h / static_cast(output_h); @@ -100,41 +97,98 @@ static EE roialign_kernel(std::vector input, U32 bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_w / output_w); U32 bin_grid_h = (sampling_ratio > 0) ? 
sampling_ratio : ceil(roi_h / output_h); + std::vector factor(output_h * output_w * bin_grid_h * bin_grid_w * 4); + std::vector offset(output_h * output_w * bin_grid_h * bin_grid_w * 4); + for (U32 ph = 0, id = 0; ph < output_h; ph++) { + F32 start_y = roi_start_y1 + ph * bin_size_h; + for (U32 pw = 0; pw < output_w; pw++) { + F32 start_x = roi_start_x1 + pw * bin_size_w; + for (U32 by = 0; by < bin_grid_h; by++) { + F32 y = start_y + + static_cast(by + 0.5f) * bin_size_h / static_cast(bin_grid_h); + for (U32 bx = 0; bx < bin_grid_w; bx++, id += 4) { + F32 x = start_x + + static_cast(bx + 0.5f) * bin_size_w / static_cast(bin_grid_w); + preprocess(feature_w, feature_h, x, y, c8Align, factor.data() + id, + offset.data() + id); + } + } + } + } F32 count = bin_grid_h * bin_grid_w; - for (U32 c = 0; c < channel; c++) { - U32 idx_nc = idx_n + c * output_h * output_w; - T *feature_map_offset = feature_map + c * feature_h * feature_w; - for (U32 ph = 0; ph < output_h; ph++) { - for (U32 pw = 0; pw < output_w; pw++) { - U32 idx = idx_nc + ph * output_w + pw; - F32 output_val = 0; - F32 start_x = roi_start_x1 + pw * bin_size_w; - F32 start_y = roi_start_y1 + ph * bin_size_h; - for (U32 by = 0; by < bin_grid_h; by++) { - F32 y = start_y + - static_cast(by + 0.5f) * bin_size_h / static_cast(bin_grid_h); - for (U32 bx = 0; bx < bin_grid_w; bx++) { - F32 x = start_x + - static_cast(bx + 0.5f) * bin_size_w / - static_cast(bin_grid_w); - F32 val = bilinear_interpolate( - (T *)feature_map_offset, feature_w, feature_h, x, y); - output_val += val; + for (U32 c0 = 0, c = 0; c0 < channel / c8Align; c0++) { + for (U32 c1 = 0; c1 < c8Align; c1++, c++) { + T *data = feature_map + c0 * feature_h * feature_w * c8Align + c1; + for (U32 ph = 0, id00 = 0; ph < output_h; ph++) { + for (U32 pw = 0; pw < output_w; pw++, idx++) { + if (mode == POOLING_MEAN) { + val = 0; + } else { + val = -UNI_F16_MAX; + } + for (U32 by = 0; by < bin_grid_h; by++) { + for (U32 bx = 0; bx < bin_grid_w; bx++, id00 += 4) { + int id01 = id00 + 1; + int id10 = id00 + 2; + int id11 = id00 + 3; + if (mode == POOLING_MEAN) { + val += factor[id00] * data[offset[id00]] + + factor[id01] * data[offset[id01]] + + factor[id10] * data[offset[id10]] + + factor[id11] * data[offset[id11]]; + } else { + val = UNI_MAX( + UNI_MAX( + UNI_MAX(UNI_MAX(val, factor[id00] * data[offset[id00]]), + factor[id01] * data[offset[id01]]), + factor[id10] * data[offset[id10]]), + factor[id11] * data[offset[id11]]); + } + } + } + output[idx] = val; + if (mode == POOLING_MEAN) { + output[idx] /= count; } } - output_val /= count; - output[idx] = output_val; } } } } +} - return SUCCESS; +template +static EE roialign_kernel(std::vector inputDesc, + std::vector input, + PoolingMode mode, + U32 output_h, + U32 output_w, + U32 sampling_ratio, + F32 spatial_scale, + T *output) +{ + EE ret = SUCCESS; + switch (mode) { + case POOLING_MEAN: { + roialign_kernel( + inputDesc, input, output_h, output_w, sampling_ratio, spatial_scale, output); + break; + } + case POOLING_MAX: { + roialign_kernel( + inputDesc, input, output_h, output_w, sampling_ratio, spatial_scale, output); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; } EE roialign_cpu(std::vector inputDesc, std::vector input, - RoIAlignParamSpec roiAlignParamSpec, + RoIAlignParamSpec p, TensorDesc outputDesc, void *output) { @@ -142,26 +196,21 @@ EE roialign_cpu(std::vector inputDesc, if (nullptr == output) { CHECK_STATUS(NULL_POINTER); } - U32 output_h = roiAlignParamSpec.output_h; - U32 output_w = 
roiAlignParamSpec.output_w; - U32 sampling_ratio = roiAlignParamSpec.sampling_ratio; - F32 spatial_scale = roiAlignParamSpec.spatial_scale; - EE ret = SUCCESS; + EE ret = NOT_SUPPORTED; switch (inputDesc[0].dt) { #ifdef _USE_FP32 case DT_F32: - ret = roialign_kernel( - input, (F32 *)output, inputDesc, output_h, output_w, sampling_ratio, spatial_scale); + ret = roialign_kernel(inputDesc, input, p.mode, p.output_h, p.output_w, + p.sampling_ratio, p.spatial_scale, (F32 *)output); break; #endif #ifdef _USE_FP16 case DT_F16: - ret = roialign_kernel( - input, (F16 *)output, inputDesc, output_h, output_w, sampling_ratio, spatial_scale); + ret = roialign_kernel(inputDesc, input, p.mode, p.output_h, p.output_w, + p.sampling_ratio, p.spatial_scale, (F16 *)output); break; #endif default: - ret = NOT_SUPPORTED; break; } return ret; diff --git a/compute/tensor/src/cpu/scale.cpp b/compute/tensor/src/cpu/scale.cpp new file mode 100644 index 00000000..3c4a0db9 --- /dev/null +++ b/compute/tensor/src/cpu/scale.cpp @@ -0,0 +1,43 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
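The rewritten ROI Align kernel above splits bilinear interpolation into a precomputation step: for every sampling point it stores four weights and four flattened pixel offsets, which both pooling modes then reuse for every channel. A minimal standalone sketch of that weight/offset computation, assuming a plain NCHW layout (c8Align == 1) and purely illustrative names:

#include <cstring>

// For a sampling point (x, y) in a w x h feature map, compute the four bilinear
// weights and the flattened offsets of the surrounding pixels. Points more than
// one pixel outside the map contribute nothing (all weights zero).
static void bilinear_weights(unsigned w, unsigned h, float x, float y,
    float factor[4], unsigned offset[4])
{
    if (y < -1.0f || y > h || x < -1.0f || x > w) {
        std::memset(factor, 0, 4 * sizeof(float));
        std::memset(offset, 0, 4 * sizeof(unsigned));
        return;
    }
    if (x < 0) { x = 0; }
    if (y < 0) { y = 0; }
    unsigned x0 = (unsigned)x, x1 = x0 + 1;
    unsigned y0 = (unsigned)y, y1 = y0 + 1;
    if (x0 >= w - 1) { x0 = x1 = w - 1; x = (float)x0; }  // clamp to the last column
    if (y0 >= h - 1) { y0 = y1 = h - 1; y = (float)y0; }  // clamp to the last row
    float lx = x - x0, ly = y - y0;                       // fractional parts
    float hx = 1 - lx, hy = 1 - ly;
    factor[0] = hy * hx;  offset[0] = y0 * w + x0;
    factor[1] = hy * lx;  offset[1] = y0 * w + x1;
    factor[2] = ly * hx;  offset[2] = y1 * w + x0;
    factor[3] = ly * lx;  offset[3] = y1 * w + x1;
}

// The sampled value is factor[0]*data[offset[0]] + ... + factor[3]*data[offset[3]];
// the mean branch accumulates this over the bin grid, while the max branch keeps
// the largest individual weighted term, as in the kernel above.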
+ +#include "cpu/tensor_computing_cpu.h" +#ifdef _USE_GENERAL +#include "cpu/general/tensor_computing_general.h" +#endif +#ifdef _USE_X86 +#include "cpu/x86/tensor_computing_x86.h" +#endif +#ifdef _USE_NEON +#include "cpu/arm/tensor_computing_arm.h" +#endif + +EE scale_cpu( + TensorDesc inputDesc, void *input, void *alpha, void *beta, ScaleParamSpec p, TensorDesc outputDesc, void *output, Arch arch) +{ + EE ret = NOT_SUPPORTED; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + ret = scale_general(inputDesc, input, alpha, beta, p, outputDesc, output); +#endif +#ifdef _USE_X86 + } else if (IS_X86(arch)) { + ret = scale_x86(inputDesc, input, alpha, beta, p, outputDesc, output); +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + ret = scale_arm(inputDesc, input, alpha, beta, p, outputDesc, output); +#endif + } + return ret; +} diff --git a/compute/tensor/src/cpu/scatter.cpp b/compute/tensor/src/cpu/scatter.cpp index bcdc659f..973b278f 100644 --- a/compute/tensor/src/cpu/scatter.cpp +++ b/compute/tensor/src/cpu/scatter.cpp @@ -27,10 +27,10 @@ inline static void scatter_elements(const TensorDesc &dataDesc, int axis = (p.axis + dataDesc.nDims) % dataDesc.nDims; axis = dataDesc.nDims - 1 - axis; - memcpy(output, data, tensorNumBytes(dataDesc)); + UNI_MEMCPY(output, data, tensorNumBytes(dataDesc)); - for (U32 i = 0; i < tensorNumElements(dataDesc); i++) { - std::vector local = calculateLocalIndex(i, dataDesc.dims, dataDesc.nDims); + for (U32 i = 0; i < tensorNumElements(updateDesc); i++) { + std::vector local = calculateLocalIndex(i, updateDesc.dims, updateDesc.nDims); local[axis] = index[i]; U32 k = calculateGlobalIndex(local.data(), dataDesc.dims, dataDesc.nDims); output[k] = update[i]; @@ -47,7 +47,7 @@ inline static void scatterND(const TensorDesc &dataDesc, const TensorDesc &outputDesc, T *output) { - memcpy(output, data, tensorNumBytes(dataDesc)); + UNI_MEMCPY(output, data, tensorNumBytes(dataDesc)); int lastDim = indexDesc.dims[0]; for (U32 i = 0; i < indexDesc.nDims - 1; i++) { diff --git a/compute/tensor/src/cpu/slice.cpp b/compute/tensor/src/cpu/slice.cpp index 72b59cef..81aeef13 100644 --- a/compute/tensor/src/cpu/slice.cpp +++ b/compute/tensor/src/cpu/slice.cpp @@ -11,17 +11,17 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
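The scatter.cpp change above makes scatter_elements iterate over the update tensor rather than the data tensor, which matters because the update can be smaller than the data: the operator first copies data into output and then performs exactly one overwrite per update element. A toy 1-D version of that contract (an illustrative helper, not part of the patch):

#include <vector>

// Minimal 1-D ScatterElements: output starts as a copy of data, then each update
// element is written to the position named by the matching index entry.
// Looping over the (possibly smaller) update tensor is what makes this correct.
template <typename T>
std::vector<T> scatter_elements_1d(
    const std::vector<T> &data, const std::vector<int> &index, const std::vector<T> &update)
{
    std::vector<T> output(data);                  // copy the data first
    for (size_t i = 0; i < update.size(); ++i) {  // one write per update element
        output[index[i]] = update[i];
    }
    return output;
}

// Example: data = {1,2,3,4,5}, index = {0,3}, update = {9,8}  ->  output = {9,2,3,8,5}.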
-#include +#include #include #include "cpu/tensor_computing_cpu.h" EE slice_cpu(TensorDesc inputDesc, void *input, SliceParamSpec p, - std::vector outputDesc, - std::vector *output) + std::vector &outputDesc, + std::vector &output) { - if (nullptr == input || nullptr == output) { + if (nullptr == input) { CHECK_STATUS(NULL_POINTER); } U32 num = outputDesc.size(); @@ -41,23 +41,66 @@ EE slice_cpu(TensorDesc inputDesc, loops *= inputDesc.dims[i]; } - if (inputDesc.df == DF_NCHWC8) { - if (axis < 2) { + bool sameFormat = true; + for (U32 j = 0; j < num; j++) { + if (inputDesc.df != outputDesc[j].df) { + sameFormat = false; + break; + } + } + + if (sameFormat && inputDesc.df == DF_NCHWC8) { + if (axis < dim - 2) { tileSize *= 8; loops /= 8; } } - U8 *ptr = (U8 *)input; - for (U32 i = 0; i < loops; i++) { + if (sameFormat) { + U8 *ptr = (U8 *)input; + for (U32 i = 0; i < loops; i++) { + for (U32 j = 0; j < num; j++) { + U32 blockSize = outputDesc[j].dims[axis] * tileSize; + if (blockSize > 0 && nullptr == output[j]) { + CHECK_STATUS(NULL_POINTER); + } + U8 *dstPtr = (U8 *)(output[j]) + i * blockSize; + UNI_MEMCPY(dstPtr, ptr, blockSize); + ptr += blockSize; + } + } + } else { + if (axis != dim - 2) { + return NOT_SUPPORTED; + } + U8 *iPtr = (U8 *)input; + U32 eleSize = bytesOf(inputDesc.dt); + tileSize /= eleSize; + U32 startDims = 0; + U32 endDims = 0; + std::set nativeFormat = {DF_NCHW, DF_MTK, DF_NORMAL}; + for (U32 j = 0; j < num; j++) { - U32 blockSize = outputDesc[j].dims[axis] * tileSize; - if (blockSize > 0 && nullptr == (*output)[j]) { - CHECK_STATUS(NULL_POINTER); + endDims += outputDesc[j].dims[axis]; + U8 *oPtr = (U8 *)output[j]; + if (inputDesc.df == DF_NCHWC8 && nativeFormat.count(outputDesc[j].df)) { + for (U32 i = 0; i < loops; i++) { + for (U32 d = startDims; d < endDims; ++d) { + U32 c8 = d % 8; + U32 c = d - c8; + for (U32 t = 0; t < tileSize; ++t) { + U32 oIdx = i * tileSize * (endDims - startDims) + + (d - startDims) * tileSize + t; + U32 iIdx = + i * tileSize * inputDesc.dims[axis] + c * tileSize + t * 8 + c8; + UNI_MEMCPY(oPtr + oIdx * eleSize, iPtr + iIdx * eleSize, eleSize); + } + } + } + } else { + return NOT_SUPPORTED; } - U8 *dstPtr = (U8 *)((*output)[j]) + i * blockSize; - memcpy(dstPtr, ptr, blockSize); - ptr += blockSize; + startDims = endDims; } } return SUCCESS; diff --git a/compute/tensor/src/cpu/space2depth.cpp b/compute/tensor/src/cpu/space2depth.cpp new file mode 100644 index 00000000..27be5afc --- /dev/null +++ b/compute/tensor/src/cpu/space2depth.cpp @@ -0,0 +1,88 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/tensor_computing_cpu.h" + +template +static inline EE space2depth_kernel( + TensorDesc inputDesc, T *input, Space2DepthParamSpec p, TensorDesc outputDesc, T *output) +{ + DataType idt, odt; + DataFormat idf, odf; + U32 in, ic, ih, iw; + U32 on, oc, oh, ow; + int bh = p.block_size; + int bw = p.block_size; + if (tensorIs4d(inputDesc)) { + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + } else if (tensorIs3d(inputDesc)) { + CHECK_STATUS(tensor3dGet(inputDesc, &idt, &idf, &in, &ic, &ih)); + CHECK_STATUS(tensor3dGet(outputDesc, &odt, &odf, &on, &oc, &oh)); + iw = ow = 1; + bw = 1; + } else { + return NOT_SUPPORTED; + } + + int cx = 1; + if (idf == DF_NCHWC8) { + cx = 8; + } + if (idf == DF_NCHWC16) { + cx = 16; + } + U32 icx = ic / cx; + for (U32 n = 0, o_i = 0; n < in; n++) { + for (U32 c1 = 0; c1 < icx; c1++) { + for (int c2 = 0; c2 < cx; c2++) { + for (int i = 0; i < bh; i++) { + for (int j = 0; j < bw; j++) { + for (U32 h = 0; h < oh; h++) { + for (U32 w = 0; w < ow; w++, o_i++) { + int i_i = + (((n * icx + c1) * ih + h * bh + i) * iw + w * bw + j) * cx + c2; + output[o_i] = input[i_i]; + } + } + } + } + } + } + } + return SUCCESS; +} + +EE space2depth_cpu( + TensorDesc inputDesc, void *input, Space2DepthParamSpec p, TensorDesc outputDesc, void *output) +{ + if (nullptr == input || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + EE ret = NOT_SUPPORTED; + switch (inputDesc.dt) { +#ifdef _USE_FP32 + case DT_F32: + ret = space2depth_kernel(inputDesc, (F32 *)input, p, outputDesc, (F32 *)output); + break; +#endif +#ifdef _USE_FP16 + case DT_F16: + ret = space2depth_kernel(inputDesc, (F16 *)input, p, outputDesc, (F16 *)output); + break; +#endif + default: + break; + } + return ret; +} diff --git a/compute/tensor/src/cpu/split.cpp b/compute/tensor/src/cpu/split.cpp index 38d25cb5..10895d11 100644 --- a/compute/tensor/src/cpu/split.cpp +++ b/compute/tensor/src/cpu/split.cpp @@ -12,7 +12,6 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
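The new space2depth.cpp above rearranges each block_size x block_size spatial block into the channel dimension, with extra inner loops for the C8/C16 tiled formats. Stripped of the channel tiling, the index mapping reduces to the following reference loop (a sketch with illustrative names, assuming plain NCHW data):

// Space-to-depth with block size b: input  N x C x (H*b) x (W*b)
//                                   output N x (C*b*b) x H x W
template <typename T>
void space2depth_nchw(const T *in, T *out, int N, int C, int OH, int OW, int b)
{
    const int IH = OH * b, IW = OW * b;
    int o = 0;  // output is written contiguously in (n, c, i, j, h, w) order
    for (int n = 0; n < N; ++n)
        for (int c = 0; c < C; ++c)
            for (int i = 0; i < b; ++i)        // row offset inside a block
                for (int j = 0; j < b; ++j)    // column offset inside a block
                    for (int h = 0; h < OH; ++h)
                        for (int w = 0; w < OW; ++w, ++o)
                            out[o] = in[((n * C + c) * IH + h * b + i) * IW + w * b + j];
}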
#include -#include #include "cpu/tensor_computing_cpu.h" @@ -33,7 +32,7 @@ EE split_cpu(TensorDesc inputDesc, if (nullptr == (*output)[i]) { CHECK_STATUS(NULL_POINTER); } - memcpy((*output)[i], input, tensorNumBytes(outputDesc[i])); + UNI_MEMCPY((*output)[i], input, tensorNumBytes(outputDesc[i])); } return SUCCESS; } diff --git a/compute/tensor/src/cpu/tensor_computing_cpu.h b/compute/tensor/src/cpu/tensor_computing_cpu.h index a504f7f4..513f327d 100644 --- a/compute/tensor/src/cpu/tensor_computing_cpu.h +++ b/compute/tensor/src/cpu/tensor_computing_cpu.h @@ -25,6 +25,7 @@ EE rnn_transform_filter_cpu(const TensorDesc *filterDescs, RNNParamSpec rnnParamSpec, TensorDesc *ftmDesc, void **ftmArray, + float *scale, Arch arch); EE rnn_transform_filter_bytes_cpu( @@ -50,6 +51,7 @@ EE rnncell_cpu(TensorDesc xDesc, const void **filter, const TensorDesc *biasDesc, const void **bias, + float *scale, void *state, RNNParamSpec rnnParamSpec, U32 batchStrideX, @@ -66,6 +68,7 @@ EE rnn_cpu(TensorDesc inputDesc, const void **filter, const TensorDesc *biasDesc, const void **bias, + float *scale, RNNParamSpec rnnParamSpec, U32 tmpBytes, void *tmp, @@ -140,7 +143,8 @@ EE non_max_suppression_cpu(std::vector inputDesc, std::vector input, NonMaxSuppressionParamSpec nonMaxSuppressionParamSpec, TensorDesc outputDesc, - void *output); + void *output, + U32 *length); EE concat_cpu(std::vector inputDesc, std::vector input, @@ -164,8 +168,8 @@ EE power_cpu(TensorDesc inputDesc, EE slice_cpu(TensorDesc inputDesc, void *input, SliceParamSpec p, - std::vector outputDesc, - std::vector *output); + std::vector& outputDesc, + std::vector& output); EE priorbox_cpu(std::vector inputDesc, PriorBoxParamSpec priorBoxParamSpec, @@ -350,4 +354,28 @@ EE gat_cpu(TensorDesc node_feature_desc, TensorDesc outputDesc, void *output, Arch arch); + +EE onehot_cpu( + TensorDesc inputDesc, void *input, OneHotParamSpec p, TensorDesc outputDesc, void *output); + +EE non_zero_cpu(TensorDesc inputDesc, void *input, TensorDesc outputDesc, void *output, U32 *length); + +EE check_cpu(TensorDesc inputADesc, + void *inputA, + TensorDesc inputBDesc, + void *inputB, + CheckParamSpec p, + TensorDesc outputDesc, + void *output); + +EE cast_cpu(TensorDesc inputDesc, void *input, TensorDesc outputDesc, void *output); + +EE space2depth_cpu( + TensorDesc inputDesc, void *input, Space2DepthParamSpec p, TensorDesc outputDesc, void *output); + +EE depth2space_cpu( + TensorDesc inputDesc, void *input, Depth2SpaceParamSpec p, TensorDesc outputDesc, void *output); + +EE scale_cpu( + TensorDesc inputDesc, void *input, void *alpha, void *beta, ScaleParamSpec p, TensorDesc outputDesc, void *output, Arch arch); #endif diff --git a/compute/tensor/src/cpu/tfslice.cpp b/compute/tensor/src/cpu/tfslice.cpp index 0394f2ae..74136744 100644 --- a/compute/tensor/src/cpu/tfslice.cpp +++ b/compute/tensor/src/cpu/tfslice.cpp @@ -37,12 +37,15 @@ EE tfslice_infer_output_size_cpu(TensorDesc inputDesc, TfSliceParamSpec p, Tenso } if (axisEnd < 0) { axisEnd = inputDesc.dims[axis] + axisEnd; + axisEnd = UNI_MAX(axisEnd, -1); } else if (axisEnd > (int)(inputDesc.dims[axis])) { axisEnd = inputDesc.dims[axis]; } - CHECK_REQUIREMENT(axisBegin >= 0 && axisEnd >= 0); - int num = (axisEnd - axisBegin) / strides[i]; - outputDesc->dims[axis] = num; + if (strides[i] > 0) { + outputDesc->dims[axis] = (axisEnd - axisBegin + strides[i] - 1) / strides[i]; + } else { + outputDesc->dims[axis] = (axisEnd - axisBegin + strides[i] + 1) / strides[i]; + } begin[i] = axisBegin; end[i] = axisEnd; } @@ -73,20 
+76,31 @@ inline static void recursive_tfslice(U8 *src, U32 tileSize) { if (i == bound) { - memcpy(dst, src, tileSize); + UNI_MEMCPY(dst, src, tileSize); return; } U32 newSrcNum = srcNum / srcDims[dimNum - 1 - i]; U32 newDstNum = dstNum / dstDims[dimNum - 1 - i]; - if (i + 1 == bound && strides[i] == 1) { - memcpy(dst, src + begin[i] * newSrcNum, tileSize * (end[i] - begin[i])); - return; + if (i + 1 == bound) { + if (strides[i] == 1) { + UNI_MEMCPY(dst, src + begin[i] * newSrcNum, tileSize * (end[i] - begin[i])); + return; + } } - for (int j = begin[i]; j < end[i]; j += strides[i]) { - U8 *newSrc = src + j * newSrcNum; - recursive_tfslice(newSrc, srcDims, newSrcNum, dst, dstDims, newDstNum, begin, end, strides, - i + 1, bound, dimNum, tileSize); - dst += newDstNum; + if (strides[i] > 0) { + for (int j = begin[i]; j < end[i]; j += strides[i]) { + U8 *newSrc = src + j * newSrcNum; + recursive_tfslice(newSrc, srcDims, newSrcNum, dst, dstDims, newDstNum, begin, end, + strides, i + 1, bound, dimNum, tileSize); + dst += newDstNum; + } + } else { + for (int j = begin[i]; j > end[i]; j += strides[i]) { + U8 *newSrc = src + j * newSrcNum; + recursive_tfslice(newSrc, srcDims, newSrcNum, dst, dstDims, newDstNum, begin, end, + strides, i + 1, bound, dimNum, tileSize); + dst += newDstNum; + } } } #endif @@ -110,10 +124,10 @@ EE tfslice_cpu( } if (axisEnd < 0) { axisEnd = inputDesc.dims[axis] + axisEnd; + axisEnd = UNI_MAX(axisEnd, -1); } else if (axisEnd > (int)(inputDesc.dims[axis])) { axisEnd = inputDesc.dims[axis]; } - CHECK_REQUIREMENT(axisBegin >= 0 && axisEnd >= 0); begin[i] = axisBegin; end[i] = axisEnd; } @@ -124,8 +138,8 @@ EE tfslice_cpu( int channelAxis = inputDesc.nDims - 2; if (inputDesc.df == outputDesc.df) { std::vector tmpInputDims(inputDesc.nDims), tmpOutputDims(outputDesc.nDims); - memcpy(tmpInputDims.data(), inputDesc.dims, inputDesc.nDims * sizeof(U32)); - memcpy(tmpOutputDims.data(), outputDesc.dims, outputDesc.nDims * sizeof(U32)); + UNI_MEMCPY(tmpInputDims.data(), inputDesc.dims, inputDesc.nDims * sizeof(U32)); + UNI_MEMCPY(tmpOutputDims.data(), outputDesc.dims, outputDesc.nDims * sizeof(U32)); int startAxis = 0; int elementNum = 1; if (inputDesc.df == DF_NCHWC8) { @@ -167,7 +181,7 @@ EE tfslice_cpu( U32 srcIndex = calculateGlobalIndex(localIndex.data(), tmpInputDims.data(), tmpInputDims.size()); U8 *src = (U8 *)input + srcIndex * elementSize; - memcpy(dst, src, tileSize); + UNI_MEMCPY(dst, src, tileSize); } #endif if (inputDesc.df == DF_NCHWC8) { @@ -179,7 +193,7 @@ EE tfslice_cpu( U32 tmpNDims = inputDesc.nDims + 1; std::vector tmpDims(tmpNDims); tmpDims[0] = 8; - memcpy(&(tmpDims[1]), inputDesc.dims, inputDesc.nDims * sizeof(U32)); + UNI_MEMCPY(&(tmpDims[1]), inputDesc.dims, inputDesc.nDims * sizeof(U32)); for (U32 i = 0; i < num; i++, dst += elementSize) { std::vector localIndex = calculateLocalIndex(i, outputDesc.dims, outputDesc.nDims); for (U32 j = 0; j < dimSize; j++) { @@ -191,7 +205,7 @@ EE tfslice_cpu( localIndex.insert(localIndex.begin(), c8); U32 index = calculateGlobalIndex(localIndex.data(), tmpDims.data(), tmpNDims); U8 *src = (U8 *)input + index * elementSize; - memcpy(dst, src, elementSize); + UNI_MEMCPY(dst, src, elementSize); } } return SUCCESS; diff --git a/compute/tensor/src/cpu/topk.cpp b/compute/tensor/src/cpu/topk.cpp index b5727e0d..9bca596f 100644 --- a/compute/tensor/src/cpu/topk.cpp +++ b/compute/tensor/src/cpu/topk.cpp @@ -14,6 +14,46 @@ #include "cpu/tensor_computing_cpu.h" #include +template +inline static bool cmp(T *data, const int &a, const int 
&b) +{ + if (increase) { + return (data[a] < data[b]) || (data[a] == data[b] && a < b); + } else { + return (data[a] > data[b]) || (data[a] == data[b] && a < b); + } +} + +template +static void heap(int *buffer, int i, int k, T *data) +{ + while (true) { + int left = 2 * i + 1; + int right = left + 1; + if (right < k) { + bool replace = cmp(data, buffer[i], buffer[left]); + if (replace && cmp(data, buffer[right], buffer[left])) { + auto tmp = buffer[i]; + buffer[i] = buffer[left]; + buffer[left] = tmp; + i = left; + } else if (replace || cmp(data, buffer[i], buffer[right])) { + auto tmp = buffer[i]; + buffer[i] = buffer[right]; + buffer[right] = tmp; + i = right; + } else + break; + } else if ((left < k) && cmp(data, buffer[i], buffer[left])) { + auto tmp = buffer[i]; + buffer[i] = buffer[left]; + buffer[left] = tmp; + i = left; + } else + break; + } +} + template inline static void topk_kernel( const TensorDesc &inputDesc, T *input, const TopKParamSpec &p, int *tmp, T *output, int *index) @@ -26,31 +66,67 @@ inline static void topk_kernel( for (U32 i = axis + 1; i < inputDesc.nDims; i++) { loopOuter *= inputDesc.dims[i]; } - int num = UNI_MIN(loops, p.topk); + int num = loops; + if (p.k > 0 && p.k < num) { + num = p.k; + } int *tmpEnd = tmp + loops; for (int i = 0; i < loopOuter; i++) { int offset = i * loops * loopInner; for (int j = 0; j < loopInner; j++, offset++) { +#if 0 for (int k = 0; k < loops; k++) { - tmp[k] = k; + tmp[k] = offset + k * loopInner; } if (increase) { - std::sort(tmp, tmpEnd, [&input, &offset, &loopInner](int i1, int i2) { - return input[offset + i1 * loopInner] < input[offset + i2 * loopInner]; - }); + std::stable_sort( + tmp, tmpEnd, [&input](int i1, int i2) { return input[i1] < input[i2]; }); } else { - std::sort(tmp, tmpEnd, [&input, &offset, &loopInner](int i1, int i2) { - return input[offset + i1 * loopInner] > input[offset + i2 * loopInner]; - }); + std::stable_sort( + tmp, tmpEnd, [&input](int i1, int i2) { return input[i1] > input[i2]; }); } if (!order) { std::sort(tmp, tmp + num); } for (int k = 0; k < num; k++) { - int id = (i * p.topk + k) * loopInner + j; - index[id] = tmp[k]; - output[id] = input[offset + tmp[k] * loopInner]; + int id = (i * num + k) * loopInner + j; + index[id] = (tmp[k] - offset) / loopInner; + output[id] = input[tmp[k]]; + } +#else + int l = 0; + int cur_idx = offset; + for (; l < num; ++l) { + tmp[num - l - 1] = cur_idx; + heap(tmp, num - l - 1, num, input); + cur_idx += loopInner; + } + + auto top = tmp[0]; + for (; l < loops; ++l) { + if (cmp(input, cur_idx, top)) { + tmp[0] = cur_idx; + heap(tmp, 0, num, input); + top = tmp[0]; + } + cur_idx += loopInner; + } + if (order) { + for (l = 0; l < num; ++l) { + int id = (i * num + (num - l - 1)) * loopInner + j; + index[id] = (tmp[0] - offset) / loopInner; + output[id] = input[tmp[0]]; + tmp[0] = tmp[num - l - 1]; + heap(tmp, 0, num - l - 1, input); + } + } else { + for (l = 0; l < num; ++l) { + int id = (i * num + l) * loopInner + j; + index[id] = (tmp[l] - offset) / loopInner; + output[id] = input[tmp[l]]; + } } +#endif } } } @@ -88,16 +164,16 @@ EE topk_cpu(TensorDesc inputDesc, if (nullptr == input || nullptr == output || nullptr == index) { CHECK_STATUS(NULL_POINTER); } - EE ret; + EE ret = SUCCESS; switch (inputDesc.dt) { +#ifdef _USE_FP32 case DT_F32: topk_wrapper1(inputDesc, (F32 *)input, p, (I32 *)tmp, (F32 *)output, (I32 *)index); - ret = SUCCESS; break; +#endif #ifdef _USE_FP16 case DT_F16: topk_wrapper1(inputDesc, (F16 *)input, p, (I32 *)tmp, (F16 *)output, (I32 *)index); 
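The topk.cpp rework above replaces the full sort of every slice with a size-k heap: the first k candidates seed the heap, each remaining element displaces the root only when it beats it, and the result is emitted by repeatedly swapping the root out. The same idea in a compact standalone form, using the standard library and omitting the index bookkeeping and tie-breaking that the real cmp()/heap() helpers also handle:

#include <algorithm>
#include <functional>
#include <queue>
#include <vector>

// Top-k largest values with a size-k min-heap: O(n log k) instead of O(n log n).
std::vector<float> topk_largest(const std::vector<float> &data, size_t k)
{
    std::priority_queue<float, std::vector<float>, std::greater<float>> heap;  // min-heap
    for (float v : data) {
        if (heap.size() < k) {
            heap.push(v);
        } else if (v > heap.top()) {  // beats the current k-th largest value
            heap.pop();
            heap.push(v);
        }
    }
    std::vector<float> out;
    for (; !heap.empty(); heap.pop()) {  // pops smallest-first
        out.push_back(heap.top());
    }
    std::reverse(out.begin(), out.end());  // return in descending order
    return out;
}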
- ret = SUCCESS; break; #endif default: diff --git a/compute/tensor/src/cpu/transpose.cpp b/compute/tensor/src/cpu/transpose.cpp index b21438b8..c72aa782 100644 --- a/compute/tensor/src/cpu/transpose.cpp +++ b/compute/tensor/src/cpu/transpose.cpp @@ -20,9 +20,8 @@ EE transpose_cpu( if (nullptr == input && tensorNumElements(inputDesc) == 0) { return SUCCESS; } - if (nullptr == input || nullptr == output || nullptr == dim) { - CHECK_STATUS(NULL_POINTER); + return NULL_POINTER; } array_transpose(bytesOf(inputDesc.dt), inputDesc.dims, input, outputDesc.dims, output, dim, inputDesc.nDims, outputDesc.nDims); diff --git a/compute/tensor/src/cpu/x86/check.cpp b/compute/tensor/src/cpu/x86/check.cpp deleted file mode 100644 index 9a28324a..00000000 --- a/compute/tensor/src/cpu/x86/check.cpp +++ /dev/null @@ -1,271 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- -#include "cpu/x86/tensor_computing_x86.h" -#include "x86_avx2_expand.h" -#ifdef _USE_FP32 -#include "cpu/x86/fp32/tensor_computing_fp32.h" -#endif - -template -EE check_u32(TensorDesc inputDescA, - const T *inputA, - TensorDesc inputDescB, - const T *inputB, - CheckMode checkMode, - TensorDesc outputDesc, - I32 *output) -{ - if (nullptr == inputA || nullptr == inputB || nullptr == output) { - CHECK_STATUS(NULL_POINTER); - } - - if (tensorNumElements(inputDescA) != tensorNumElements(inputDescB)) { - CHECK_STATUS(NOT_MATCH); - } - - U32 size = tensorNumElements(inputDescA); - U32 loopOuter = inputDescA.dims[inputDescA.nDims - 1]; - if (tensorNumElements(outputDesc) != loopOuter) { - CHECK_STATUS(NOT_MATCH); - } - I32 length = size / loopOuter; - for (U32 j = 0; j < loopOuter; j++) { - const T *arrayA = inputA + j * length; - const T *arrayB = inputB + j * length; - __m256i count_v = _mm256_set1_epi32(0); - __m256i one_v = _mm256_set1_epi32(1); - switch (checkMode) { - case CHECK_GREAT: { - I32 i = 0; - for (; i < length - 7; i += 8) { - __m256i a = _mm256_loadu_si256((__m256i *)(arrayA + i)); - __m256i b = _mm256_loadu_si256((__m256i *)(arrayB + i)); - count_v = _mm256_add_epi32( - count_v, _mm256_and_si256(one_v, _mm256_cmpgt_epi32(a, b))); - } - I32 count = _mm256_hadd_u32(count_v); - for (; i < length; i++) { - if (arrayA[i] == arrayB[i]) { - count++; - } - } - output[j] = (count == length); - break; - } - case CHECK_GREATEQUAL: { - I32 i = 0; - for (; i < length - 7; i += 8) { - __m256i a = _mm256_loadu_si256((__m256i *)(arrayA + i)); - __m256i b = _mm256_loadu_si256((__m256i *)(arrayB + i)); - __m256i cmp = - _mm256_or_si256(_mm256_cmpeq_epi32(a, b), _mm256_cmpgt_epi32(a, b)); - count_v = _mm256_add_epi32(count_v, _mm256_and_si256(one_v, cmp)); - } - I32 count = _mm256_hadd_u32(count_v); - for (; i < length; i++) { - if (arrayA[i] == arrayB[i]) { - count++; - } - } - output[j] = (count == length); - break; - } - case CHECK_EQUAL: { - I32 i = 0; - for (; i < length - 7; i += 8) { - __m256i a = _mm256_loadu_si256((__m256i *)(arrayA + i)); - __m256i b = _mm256_loadu_si256((__m256i *)(arrayB + i)); - count_v = _mm256_add_epi32( - count_v, _mm256_and_si256(one_v, _mm256_cmpeq_epi32(a, b))); - } - I32 count = _mm256_hadd_u32(count_v); - for (; i < length; i++) { - if (arrayA[i] == arrayB[i]) { - count++; - } - } - output[j] = (count == length); - break; - } - default: - return NOT_SUPPORTED; - break; - } - } - return SUCCESS; -} - -template -EE check_kernel(TensorDesc inputDescA, - const TA *inputA, - TensorDesc inputDescB, - const TB *inputB, - CheckMode checkMode, - TensorDesc outputDesc, - I32 *output) -{ - if (nullptr == inputA || nullptr == inputB || nullptr == output) { - CHECK_STATUS(NULL_POINTER); - } - - if (tensorNumElements(inputDescA) != tensorNumElements(inputDescB)) { - CHECK_STATUS(NOT_MATCH); - } - - U32 size = tensorNumElements(inputDescA); - U32 loopOuter = inputDescA.dims[inputDescA.nDims - 1]; - if (tensorNumElements(outputDesc) != loopOuter) { - CHECK_STATUS(NOT_MATCH); - } - I32 length = size / loopOuter; - - for (U32 j = 0; j < loopOuter; j++) { - const TA *arrayA = inputA + j * length; - const TB *arrayB = inputB + j * length; - switch (checkMode) { - case CHECK_GREAT: { - output[j] = 1; - for (I32 i = 0; i < length; i++) { - if (arrayA[i] <= (TA)arrayB[i]) { - output[j] = 0; - break; - } - } - break; - } - case CHECK_GREATEQUAL: { - output[j] = 1; - for (I32 i = 0; i < length; i++) { - if (arrayA[i] < (TA)arrayB[i]) { - output[j] = 0; - break; - } - } - break; - } 
- case CHECK_EQUAL: { - output[j] = 1; - for (I32 i = 0; i < length; i++) { - if (arrayA[i] != (TA)arrayB[i]) { - output[j] = 0; - break; - } - } - break; - } - default: - return NOT_SUPPORTED; - break; - } - } - return SUCCESS; -} - -template -EE check_wrapper(TensorDesc inputDescA, - const TA *inputA, - TensorDesc inputDescB, - const void *inputB, - CheckMode checkMode, - TensorDesc outputDesc, - I32 *output) -{ - EE ret = SUCCESS; - switch (inputDescB.dt) { -#ifdef _USE_FP32 - case DT_F32: { - ret = check_kernel( - inputDescA, inputA, inputDescB, (const F32 *)inputB, checkMode, outputDesc, output); - break; - } -#endif - case DT_U32: { - ret = check_kernel( - inputDescA, inputA, inputDescB, (const U32 *)inputB, checkMode, outputDesc, output); - break; - } - case DT_I32: { - ret = check_kernel( - inputDescA, inputA, inputDescB, (const I32 *)inputB, checkMode, outputDesc, output); - break; - } - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} - -EE check_x86(TensorDesc inputDescA, - const void *inputA, - TensorDesc inputDescB, - const void *inputB, - CheckParamSpec p, - TensorDesc outputDesc, - void *output) -{ - DataType idt = inputDescA.dt; - EE ret = SUCCESS; - - if (idt != inputDescB.dt) { - switch (idt) { -#ifdef _USE_FP32 - case DT_F32: { - ret = check_wrapper(inputDescA, (const F32 *)inputA, inputDescB, inputB, - p.check_mode, outputDesc, (I32 *)output); - break; - } -#endif - case DT_U32: { - ret = check_wrapper(inputDescA, (const U32 *)inputA, inputDescB, inputB, - p.check_mode, outputDesc, (I32 *)output); - break; - } - case DT_I32: { - ret = check_wrapper(inputDescA, (const I32 *)inputA, inputDescB, inputB, - p.check_mode, outputDesc, (I32 *)output); - break; - } - default: - ret = NOT_SUPPORTED; - break; - } - return ret; - } - - switch (idt) { -#ifdef _USE_FP32 - case DT_F32: { - ret = check_fp32(inputDescA, (const F32 *)inputA, inputDescB, (const F32 *)inputB, - p.check_mode, outputDesc, (I32 *)output); - break; - } -#endif - case DT_U32: { - ret = check_u32(inputDescA, (const U32 *)inputA, inputDescB, (const U32 *)inputB, - p.check_mode, outputDesc, (I32 *)output); - break; - } - case DT_I32: { - ret = check_u32(inputDescA, (const I32 *)inputA, inputDescB, (const I32 *)inputB, - p.check_mode, outputDesc, (I32 *)output); - break; - } - default: - ret = NOT_SUPPORTED; - break; - } - - return ret; -} diff --git a/compute/tensor/src/cpu/x86/convolution.cpp b/compute/tensor/src/cpu/x86/convolution.cpp index 2b4370de..16c85e85 100644 --- a/compute/tensor/src/cpu/x86/convolution.cpp +++ b/compute/tensor/src/cpu/x86/convolution.cpp @@ -11,7 +11,6 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
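The x86 check kernels deleted above implemented a per-row reduction: for each outer row, the output is 1 only when every element pair satisfies the requested relation. That responsibility now sits with the arch-neutral check_cpu declared in tensor_computing_cpu.h; its contract, written as a plain scalar loop with illustrative names (the enum below is a stand-in, not the library's), looks roughly like this:

// Per-row "all elements satisfy the relation" semantics of the check operator.
enum class CheckOp { Great, GreatEqual, Equal };

template <typename T>
void check_rows(const T *a, const T *b, int rows, int cols, CheckOp op, int *out)
{
    for (int j = 0; j < rows; ++j) {
        out[j] = 1;
        for (int i = 0; i < cols; ++i) {
            T x = a[j * cols + i], y = b[j * cols + i];
            bool ok = (op == CheckOp::Great)      ? (x > y)
                    : (op == CheckOp::GreatEqual) ? (x >= y)
                                                  : (x == y);
            if (!ok) {
                out[j] = 0;  // one failing pair invalidates the whole row
                break;
            }
        }
    }
}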
-#include #include "cpu/x86/tensor_computing_x86.h" #ifdef _USE_FP32 #include "cpu/x86/fp32/tensor_computing_fp32.h" @@ -52,10 +51,12 @@ EE convolution_infer_forward_algorithm_x86(TensorDesc inputDesc, U32 group = convParamSpec.group; U32 strideH = convParamSpec.stride_h; U32 strideW = convParamSpec.stride_w; - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; + U32 dilateH = convParamSpec.dilatedRate_h; + U32 dilateW = convParamSpec.dilatedRate_w; if ((targetDataType != DT_I8) && (targetDataType != DT_U8_Q) && ((idf != DF_NCHWC8) || (ic / group % 8 != 0))) { @@ -63,6 +64,13 @@ EE convolution_infer_forward_algorithm_x86(TensorDesc inputDesc, return SUCCESS; } + if ((targetDataType == DT_F32) && (idf == DF_NCHWC8) && (group == 1) && (fh == 3) && + (fw == 3) && (dilateH == 1) && (dilateW == 1) && (oh > 8) && (ow > 8) && (strideH == 1) && + (strideW == 1)) { + *algorithm = CONVOLUTION_ALGORITHM_WINOGRAD; + return SUCCESS; + } + if ((fh == 1) && (fw == 1)) { *algorithm = CONVOLUTION_ALGORITHM_POINTWISE; return SUCCESS; @@ -101,6 +109,10 @@ EE convolution_transform_filter_bytes_x86(TensorDesc filterDesc, case CONVOLUTION_ALGORITHM_POINTWISE: *bytes = fnPadding * fcPadding; break; + case CONVOLUTION_ALGORITHM_WINOGRAD: + *bytes = + fnPadding * fcPadding * 36 + 16 * 32 * 18; // bolckIc:16, blockOc:32, weight:3*6=18 + break; default: return NOT_SUPPORTED; } @@ -205,8 +217,7 @@ EE convolution_x86(TensorDesc inputDesc, U32 icGroupSize = inputDesc.dims[dataChannelAxis] / group; void *inputTransform; - if ((inputDesc.df == DF_NCHWC8 && icGroupSize % 8 != 0) || - (inputDesc.df == DF_NCHWC16 && icGroupSize % 16 != 0)) { + if ((inputDesc.df == DF_NCHWC8 && icGroupSize % 8 != 0)) { TensorDesc tmpInputDesc = inputDesc; tmpInputDesc.df = DF_NCHW; transformToNCHW(inputDesc, input, tmpInputDesc, tmp); @@ -248,9 +259,10 @@ EE convolution_x86(TensorDesc inputDesc, #endif #ifdef _USE_INT8 case DT_I8: { - ret = convolution_int8(tmpInputDesc, (UINT8 *)tmpInput, tmpFilterDesc, - (INT8 *)tmpFilter, convParamSpec, algorithm, tmpBiasDesc, (I32 *)tmpBias, - tmpBytes, tmp, tmpOutputDesc, tmpOutput, (F32 *)scale, activationDesc, arch); + ret = convolution_int8(tmpInputDesc, (UINT8 *)tmpInput, (F32 *)eltwiseInput, + tmpFilterDesc, (INT8 *)tmpFilter, convParamSpec, algorithm, tmpBiasDesc, + (F32 *)tmpBias, tmpBytes, tmp, tmpOutputDesc, tmpOutput, (F32 *)scale, + activationDesc, arch); break; } #endif diff --git a/compute/tensor/src/cpu/x86/deconvolution.cpp b/compute/tensor/src/cpu/x86/deconvolution.cpp index fc0a6394..7dde4d48 100644 --- a/compute/tensor/src/cpu/x86/deconvolution.cpp +++ b/compute/tensor/src/cpu/x86/deconvolution.cpp @@ -128,15 +128,15 @@ EE deconvolution_pointwise_x86(TensorDesc inputDesc, CHECK_REQUIREMENT(idf == DF_NCHWC8); ConvolutionParamSpec p = createConvolutionParamSpec( - 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, oc, Convolution_Pointwise); + 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, oc, CONVOLUTION_POINTWISE); TensorDesc nullDesc; U8 *convBias = (U8 *)tmp; if (fh == convParamSpec.stride_h && fw == convParamSpec.stride_w) { for (U32 ii = 0; ii < fh * fw; ++ii) { - memcpy(convBias + ii * oc * bytesOf(odt), bias, oc * bytesOf(odt)); + UNI_MEMCPY(convBias + ii * oc * bytesOf(odt), bias, oc * 
bytesOf(odt)); } } else { - memset(convBias, 0, oc * fh * fw * bytesOf(odt)); + UNI_MEMSET(convBias, 0, oc * fh * fw * bytesOf(odt)); } TensorDesc convOutDesc = tensor4df(odt, DF_NCHWC8, in, oc * fh * fw, ih, iw); U8 *convOut = (U8 *)tmp + oc * fh * fw * bytesOf(odt); @@ -153,11 +153,13 @@ EE deconvolution_pointwise_x86(TensorDesc inputDesc, } else { U8 *tmpOutputPtr = (U8 *)output; U32 biasTileSize = bytesOf(biasDesc.dt) * 8; - U8 *biasPtr = (U8 *)bias; - for (U32 c = 0; c < oc / 8; c++, biasPtr += biasTileSize) { - for (U32 n = 0; n < oh * ow; n++) { - memcpy(tmpOutputPtr, biasPtr, biasTileSize); - tmpOutputPtr += biasTileSize; + for (U32 n = 0; n < on; ++n) { + U8 *biasPtr = (U8 *)bias; + for (U32 c = 0; c < oc / 8; c++, biasPtr += biasTileSize) { + for (U32 hw = 0; hw < oh * ow; hw++) { + UNI_MEMCPY(tmpOutputPtr, biasPtr, biasTileSize); + tmpOutputPtr += biasTileSize; + } } } deconvolution_overlap_crop_c8_x86(convOut, output, inputDesc, outputDesc, convParamSpec); diff --git a/compute/tensor/src/cpu/x86/depthwise_convolution.cpp b/compute/tensor/src/cpu/x86/depthwise_convolution.cpp index 3a21f766..5c6d5e1f 100644 --- a/compute/tensor/src/cpu/x86/depthwise_convolution.cpp +++ b/compute/tensor/src/cpu/x86/depthwise_convolution.cpp @@ -39,6 +39,7 @@ EE depthwise_convolution_transform_filter_x86(TensorDesc filterDesc, } EE depthwise_convolution_infer_forward_tmp_bytes_x86(TensorDesc inputDesc, + TensorDesc dwFilterDesc, TensorDesc outputDesc, ConvolutionParamSpec convParamSpec, DepthwiseConvolutionForwardAlgorithm algorithm, @@ -47,16 +48,18 @@ EE depthwise_convolution_infer_forward_tmp_bytes_x86(TensorDesc inputDesc, if (nullptr == bytes) { CHECK_STATUS(NULL_POINTER); } - DataType idt, odt; - DataFormat idf, odf; + DataType idt, odt, fdt; + DataFormat idf, odf, fdf; U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; U32 on, oc, oh, ow; CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + CHECK_STATUS(tensor4dGet(outputDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; U32 ih_pad = ih + paddingT + paddingB; U32 iw_pad = iw + paddingL + paddingR; @@ -66,7 +69,7 @@ EE depthwise_convolution_infer_forward_tmp_bytes_x86(TensorDesc inputDesc, *bytes = ic * ih_pad * iw_pad; break; case DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT: - *bytes = ic * ih_pad * iw_pad + ic * oh * ow; + *bytes = ic * ih_pad * (iw_pad + 4) + ic * oh * ow + ic * 4; break; default: { ret = NOT_MATCH; @@ -74,6 +77,9 @@ EE depthwise_convolution_infer_forward_tmp_bytes_x86(TensorDesc inputDesc, break; } } + if (idt == DT_I8 || idt == DT_U8_Q) { + *bytes += fh * fw * 16 * 16; + } *bytes *= bytesOf(idt); if (idf != DF_NCHWC8) { *bytes += tensorNumBytes(inputDesc); @@ -88,6 +94,7 @@ EE depthwise_convolution_x86(TensorDesc inputDesc, const void *filter, ConvolutionParamSpec convParamSpec, DepthwiseConvolutionForwardAlgorithm algorithm, + void *scale, TensorDesc biasDesc, const void *bias, U32 tmpBytes, @@ -100,7 +107,7 @@ EE depthwise_convolution_x86(TensorDesc inputDesc, TensorDesc blankTensorDesc; ActivationParamSpec blankActivationParamSpec; return depthwise_pointwise_convolution_x86(inputDesc, input, nullptr, 
filterDesc, filter, - blankTensorDesc, nullptr, convParamSpec, algorithm, blankTensorDesc, bias, biasDesc, + blankTensorDesc, nullptr, convParamSpec, algorithm, nullptr, blankTensorDesc, bias, biasDesc, nullptr, tmpBytes, tmp, outputDesc, output, depthwiseActivationParamSpec, blankActivationParamSpec, arch); } diff --git a/compute/tensor/src/cpu/x86/depthwise_pointwise_convolution.cpp b/compute/tensor/src/cpu/x86/depthwise_pointwise_convolution.cpp index f65e1606..60b332b3 100644 --- a/compute/tensor/src/cpu/x86/depthwise_pointwise_convolution.cpp +++ b/compute/tensor/src/cpu/x86/depthwise_pointwise_convolution.cpp @@ -15,6 +15,9 @@ #ifdef _USE_FP32 #include "cpu/x86/fp32/tensor_computing_fp32.h" #endif +#ifdef _USE_INT8 +#include "cpu/x86/int8/tensor_computing_int8.h" +#endif #include "tensor_transpose.h" EE depthwise_pointwise_convolution_transform_filter_x86(TensorDesc dwFilterDesc, @@ -36,6 +39,14 @@ EE depthwise_pointwise_convolution_transform_filter_x86(TensorDesc dwFilterDesc, (F32 *)dwFilterTransformed, pwFtmDesc, (F32 *)pwFilterTransformed); break; } +#endif +#ifdef _USE_INT8 + case DT_I8: { + ret = depthwise_pointwise_convolution_transform_filter_int8(dwFilterDesc, + (INT8 *)dwFilter, pwFilterDesc, (INT8 *)pwFilter, algorithm, dwFtmDesc, + (INT8 *)dwFilterTransformed, pwFtmDesc, (INT8 *)pwFilterTransformed); + break; + } #endif default: ret = NOT_SUPPORTED; @@ -53,6 +64,7 @@ EE depthwise_pointwise_convolution_x86(TensorDesc inputDesc, const void *pwFilter, ConvolutionParamSpec convParamSpec, DepthwiseConvolutionForwardAlgorithm algorithm, + void *scale, TensorDesc dwBiasDesc, const void *dwBias, TensorDesc pwBiasDesc, @@ -67,22 +79,38 @@ EE depthwise_pointwise_convolution_x86(TensorDesc inputDesc, { TensorDesc newInputDesc = inputDesc; void *newInput = input; - if (inputDesc.df != DF_NCHWC8) { - newInputDesc.df = DF_NCHWC8; + DataFormat dstF = inputDesc.df; + if (inputDesc.dt == DT_U8_Q || inputDesc.df == DF_NCHWC16) { + dstF = DF_NCHWC16; + } else { + dstF = DF_NCHWC8; + } + if (inputDesc.df != dstF) { + newInputDesc.df = dstF; newInput = tmp; tmp = (U8 *)tmp + tensorNumBytes(inputDesc); tmpBytes -= tensorNumBytes(inputDesc); - transformNCHWToNCHWC8(inputDesc, input, newInputDesc, newInput); + transformFormat(inputDesc, input, newInputDesc, newInput); } EE ret = SUCCESS; switch (dwFilterDesc.dt) { #ifdef _USE_FP32 case DT_F32: { - ret = depthwise_pointwise_convolution_fp32(newInputDesc, (F32 *)newInput, - (F32 *)eltwiseInput, dwFilterDesc, (const F32 *)dwFilter, pwFilterDesc, - (const F32 *)pwFilter, convParamSpec, algorithm, dwBiasDesc, (const F32 *)dwBias, - pwBiasDesc, (const F32 *)pwBias, tmpBytes, tmp, outputDesc, (F32 *)output, - depthwiseActivationParamSpec, pointwiseActivationParamSpec, arch); + ret = depthwise_pointwise_convolution_fp32(newInputDesc, (F32 *)newInput, (F32 *)eltwiseInput, dwFilterDesc, + (const F32 *)dwFilter, pwFilterDesc, (const F32 *)pwFilter, convParamSpec, + algorithm, dwBiasDesc, (const F32 *)dwBias, pwBiasDesc, (const F32 *)pwBias, + tmpBytes, tmp, outputDesc, (F32 *)output, depthwiseActivationParamSpec, + pointwiseActivationParamSpec, arch); + break; + } +#endif +#ifdef _USE_INT8 + case DT_I8: { + ret = depthwise_pointwise_convolution_int8(newInputDesc, (UINT8 *)newInput, (F32 *)eltwiseInput, dwFilterDesc, + (const INT8 *)dwFilter, pwFilterDesc, (const INT8 *)pwFilter, convParamSpec, + dwBiasDesc, (const F32 *)dwBias, pwBiasDesc, (const F32 *)pwBias, + tmpBytes, tmp, outputDesc, (void *)output, (F32 *)scale, depthwiseActivationParamSpec, + 
pointwiseActivationParamSpec); break; } #endif diff --git a/compute/tensor/src/cpu/x86/eltwise.cpp b/compute/tensor/src/cpu/x86/eltwise.cpp index f11fbd87..2e18d9dd 100644 --- a/compute/tensor/src/cpu/x86/eltwise.cpp +++ b/compute/tensor/src/cpu/x86/eltwise.cpp @@ -11,7 +11,6 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -#include #include "cpu/x86/tensor_computing_x86.h" #ifdef _USE_FP32 #include "cpu/x86/fp32/tensor_computing_fp32.h" @@ -25,7 +24,7 @@ EE eltwise_x86(DataType dataType, void *output, EltwiseMode eltwiseMode) { - EE ret = SUCCESS; + EE ret = NOT_SUPPORTED; switch (dataType) { #ifdef _USE_FP32 case DT_F32: { @@ -33,6 +32,7 @@ EE eltwise_x86(DataType dataType, break; } #endif + case DT_U32: case DT_I32: { ret = eltwise_i32(input, inputSize, num, len, output, eltwiseMode); break; @@ -42,7 +42,6 @@ EE eltwise_x86(DataType dataType, break; } default: - ret = NOT_SUPPORTED; break; } return ret; diff --git a/compute/tensor/src/cpu/x86/fp32/attention.cpp b/compute/tensor/src/cpu/x86/fp32/attention.cpp index dab532ec..ddb60c0a 100644 --- a/compute/tensor/src/cpu/x86/fp32/attention.cpp +++ b/compute/tensor/src/cpu/x86/fp32/attention.cpp @@ -25,14 +25,14 @@ EE attention_fp32(U32 batch, } F32 mask_s = -10000.0; - I32 count = array_sum_f32(input, toSequenceLength); - I32 valid = UNI_MIN(count, fromSequenceLength); __m256 mask_v = _mm256_set1_ps(mask_s); __m256 one_v = _mm256_set1_ps(1.0); for (U32 n = 0; n < batch; n++) { + U32 count = array_sum_f32(input, toSequenceLength); + U32 valid = UNI_MIN(count, (U32)fromSequenceLength); for (U32 i = 0; i < numHeads; i++) { if (i == 0) { - for (I32 j = 0; j < valid; j++) { + for (U32 j = 0; j < valid; j++) { if (j == 0) { I32 k = 0; for (; k < toSequenceLength - 7; k += 8) { @@ -46,12 +46,12 @@ EE attention_fp32(U32 batch, output[k] = value; } } else { - memcpy( + UNI_MEMCPY( output + j * toSequenceLength, output, toSequenceLength * sizeof(F32)); } } - for (I32 j = valid; j < fromSequenceLength; j++) { + for (U32 j = valid; j < (U32)fromSequenceLength; j++) { if (j == valid) { I32 k = 0; for (; k < toSequenceLength - 7; k += 8) { @@ -61,12 +61,12 @@ EE attention_fp32(U32 batch, output[j * toSequenceLength + k] = mask_s; } } else { - memcpy(output + j * toSequenceLength, output + valid * toSequenceLength, + UNI_MEMCPY(output + j * toSequenceLength, output + valid * toSequenceLength, toSequenceLength * sizeof(F32)); } } } else { - memcpy(output + i * fromSequenceLength * toSequenceLength, output, + UNI_MEMCPY(output + i * fromSequenceLength * toSequenceLength, output, fromSequenceLength * toSequenceLength * sizeof(F32)); } } diff --git a/compute/tensor/src/cpu/x86/fp32/attention_mask.cpp b/compute/tensor/src/cpu/x86/fp32/attention_mask.cpp index 9d683bca..2f4c3bb2 100644 --- a/compute/tensor/src/cpu/x86/fp32/attention_mask.cpp +++ b/compute/tensor/src/cpu/x86/fp32/attention_mask.cpp @@ -11,7 +11,6 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
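The attention.cpp change above moves the count/valid computation inside the batch loop, so the mask is rebuilt for each batch item instead of being shared. The mask itself follows the usual additive pattern: given a 0/1 key-validity vector, allowed positions contribute 0 to the attention logits while padded positions contribute a large negative constant, so softmax drives their weights toward zero; query rows beyond the number of valid positions are fully masked. A rough sketch under those assumptions (illustrative function, not the kernel itself):

#include <vector>

// Build an additive attention mask from a 0/1 key-validity vector.
std::vector<float> build_attention_mask(
    const std::vector<float> &keyValid, int fromLen, int toLen, int validFrom)
{
    const float kMask = -10000.0f;
    std::vector<float> mask(fromLen * toLen);
    for (int j = 0; j < fromLen; ++j) {
        for (int k = 0; k < toLen; ++k) {
            float allowed = (j < validFrom) ? keyValid[k] : 0.0f;
            mask[j * toLen + k] = (1.0f - allowed) * kMask;  // 0 if allowed, -10000 if masked
        }
    }
    return mask;  // added to the raw attention scores before softmax
}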
-#include #include "cpu/x86/fp32/tensor_computing_fp32.h" EE attention_mask_fp32(TensorDesc inputDesc, @@ -56,7 +55,7 @@ EE attention_mask_fp32(TensorDesc inputDesc, if (start + loops > klen) { loops = UNI_MAX(klen - start, 0); } - memset(&mask[i * klen + start], 0, sizeof(F32) * loops); + UNI_MEMSET(&mask[i * klen + start], 0, sizeof(F32) * loops); } } I32 loops = tensorNumElements(inputDesc) / length; diff --git a/compute/tensor/src/cpu/x86/fp32/check.cpp b/compute/tensor/src/cpu/x86/fp32/check.cpp deleted file mode 100644 index 9140fe00..00000000 --- a/compute/tensor/src/cpu/x86/fp32/check.cpp +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -#include "cpu/x86/fp32/tensor_computing_fp32.h" -#include "x86_avx2_expand.h" - -EE check_fp32(TensorDesc inputDescA, - const F32 *inputA, - TensorDesc inputDescB, - const F32 *inputB, - CheckMode checkMode, - TensorDesc outputDesc, - I32 *output) -{ - if (nullptr == inputA || nullptr == inputB || nullptr == output) { - CHECK_STATUS(NULL_POINTER); - } - - if (tensorNumElements(inputDescA) != tensorNumElements(inputDescB)) { - CHECK_STATUS(NOT_MATCH); - } - - U32 size = tensorNumElements(inputDescA); - U32 loopOuter = inputDescA.dims[inputDescA.nDims - 1]; - I32 length = size / loopOuter; - if (tensorNumElements(outputDesc) != loopOuter) { - CHECK_STATUS(NOT_MATCH); - } - for (U32 j = 0; j < loopOuter; j++) { - const F32 *arrayA = inputA + j * length; - const F32 *arrayB = inputB + j * length; - switch (checkMode) { - case CHECK_GREAT: { - __m256i count_v = _mm256_set1_epi32(0); - I32 i = 0; - for (; i < length - 7; i += 8) { - __m256 a = _mm256_loadu_ps(arrayA + i); - __m256 b = _mm256_loadu_ps(arrayA + i); - count_v = _mm256_add_epi32( - count_v, _mm256_cvtps_epi32(_mm256_cmp_ps(a, b, _CMP_GT_OS))); - } - I32 count = _mm256_hadd_u32(count_v); - for (; i < length; i++) { - if (arrayA[i] > arrayB[i]) { - count++; - } - } - output[j] = (count == length); - break; - } - case CHECK_GREATEQUAL: { - __m256i count_v = _mm256_set1_epi32(0); - I32 i = 0; - for (; i < length - 7; i += 8) { - __m256 a = _mm256_loadu_ps(arrayA + i); - __m256 b = _mm256_loadu_ps(arrayA + i); - count_v = _mm256_add_epi32( - count_v, _mm256_cvtps_epi32(_mm256_cmp_ps(a, b, _CMP_GE_OS))); - } - I32 count = _mm256_hadd_u32(count_v); - for (; i < length; i++) { - if (arrayA[i] >= arrayB[i]) { - count++; - } - } - output[j] = (count == length); - break; - } - case CHECK_EQUAL: { - 
__m256i count_v = _mm256_set1_epi32(0); - I32 i = 0; - for (; i < length - 7; i += 8) { - __m256 a = _mm256_loadu_ps(arrayA + i); - __m256 b = _mm256_loadu_ps(arrayA + i); - count_v = _mm256_add_epi32( - count_v, _mm256_cvtps_epi32(_mm256_cmp_ps(a, b, _CMP_EQ_OS))); - } - I32 count = _mm256_hadd_u32(count_v); - for (; i < length; i++) { - if (arrayA[i] == arrayB[i]) { - count++; - } - } - output[j] = (count == length); - break; - } - default: - CHECK_STATUS(NOT_SUPPORTED); - break; - } - } - return SUCCESS; -} diff --git a/compute/tensor/src/cpu/x86/fp32/convolution.cpp b/compute/tensor/src/cpu/x86/fp32/convolution.cpp index fb4ea453..c6a31782 100644 --- a/compute/tensor/src/cpu/x86/fp32/convolution.cpp +++ b/compute/tensor/src/cpu/x86/fp32/convolution.cpp @@ -12,8 +12,6 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #include "sys.h" -#include "error.h" - #include "cpu/x86/fp32/tensor_computing_fp32.h" EE convolution_infer_forward_tmp_bytes_fp32(TensorDesc inputDesc, @@ -34,10 +32,10 @@ EE convolution_infer_forward_tmp_bytes_fp32(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; U32 ih_pad = ih + paddingT + paddingB; U32 iw_pad = iw + paddingL + paddingR; @@ -64,6 +62,13 @@ EE convolution_infer_forward_tmp_bytes_fp32(TensorDesc inputDesc, case CONVOLUTION_ALGORITHM_GEMM_ICNCHW: *bytes = 0; break; + case CONVOLUTION_ALGORITHM_WINOGRAD: { + U32 wSize = 3; + U32 blockIcDim = 32; + U32 blockOcDim = 32; + *bytes = 36 * blockIcDim * ((ow + 3) / 4 + 1) + (36 * blockOcDim + 36 * 36) * wSize; + break; + } default: ret = NOT_MATCH; break; @@ -134,6 +139,10 @@ EE convolution_fp32(TensorDesc inputDesc, ret = convolution_direct_nchw(inputDesc, input, filterDesc, filter, convParamSpec, biasDesc, bias, tmpBytes, tmp, outputDesc, output, activationDesc); break; + case CONVOLUTION_ALGORITHM_WINOGRAD: + ret = convolution_winograd(inputDesc, input, eltwiseInput, filterDesc, filter, convParamSpec, + biasDesc, bias, tmpBytes, tmp, outputDesc, output, activationDesc); + break; default: ret = NOT_SUPPORTED; break; diff --git a/compute/tensor/src/cpu/x86/fp32/convolution_1x1_direct.cpp b/compute/tensor/src/cpu/x86/fp32/convolution_1x1_direct.cpp index 265fb722..eb0a3b41 100644 --- a/compute/tensor/src/cpu/x86/fp32/convolution_1x1_direct.cpp +++ b/compute/tensor/src/cpu/x86/fp32/convolution_1x1_direct.cpp @@ -1752,6 +1752,10 @@ EE convolution_1x1_direct(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + if (idf == DF_NCHWC16 && ih == 1 && iw == 1) { + idf = DF_NCHWC8; + } + if ((fdf != DF_NCHWCxN24 && fdf != DF_NCHWCxN32) || (idf != DF_NCHWC8) || (ic % 8 != 0)) { CHECK_STATUS(NOT_MATCH); } @@ -1765,13 +1769,16 @@ EE convolution_1x1_direct(TensorDesc inputDesc, I32 unrollHwArray[4] = {12, 6, 4, 3}; // get computing params - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = 
convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; U32 strideH = convParamSpec.stride_h; U32 strideW = convParamSpec.stride_w; + U32 phT = (paddingT + strideH - 1) / strideH; + U32 phB = (paddingB + strideH - 1) / strideH; U32 ohow = oh * ow; + U32 ohowMain = (oh - phT - phB) * ow; U32 ihiw = ih * iw; U32 newIh = (ih + strideH - 1) / strideH; U32 newIw = (iw + strideW - 1) / strideW; @@ -1783,16 +1790,16 @@ EE convolution_1x1_direct(TensorDesc inputDesc, U32 ocBlockNums = InferConvDirectOcBlockNum(oc, ocbArray, unrollOc, unrollOcArray); U32 ocBBlockNums = BLOCK_OC_DIM / unrollOc; U32 alpha = OMP_NUM_THREADS / gcd(ocBlockNums, OMP_NUM_THREADS); - U32 blockHwDim = InferConvBlockHW(ohow, BLOCK_HW_DIM, alpha); + U32 blockHwDim = InferConvBlockHW(ohowMain, BLOCK_HW_DIM, alpha); blockHwDim = (blockHwDim + unrollHwX - 1) / unrollHwX * unrollHwX; - U32 hwBlockNums = CeilDivide(ohow, blockHwDim); - if (paddingT != 0 || paddingB != 0 || paddingL != 0 || paddingR != 0) { + U32 hwBlockNums = CeilDivide(ohowMain, blockHwDim); + if (paddingL != 0 || paddingR != 0) { hwBlockNums = oh; } -#if defined(_WIN32) && defined(_USE_OPENMP) +#ifdef _USE_OPENMP OpenMPController ompCtr; - ompCtr.checkAndSetOpenMP(ohow, BLOCK_HW_DIM, ocBlockNums); + ompCtr.checkAndSetOpenMP(ohowMain, BLOCK_HW_DIM, ocBlockNums); #endif // infer kernel params @@ -1831,7 +1838,7 @@ EE convolution_1x1_direct(TensorDesc inputDesc, } #ifdef _USE_OPENMP -#pragma omp parallel num_threads(OMP_NUM_THREADS) +#pragma omp parallel num_threads(OMP_NUM_THREADS) if (ompCtr.useOmp) { #endif F32 *tmpI = inArray; @@ -1844,14 +1851,15 @@ EE convolution_1x1_direct(TensorDesc inputDesc, #ifdef _USE_OPENMP #pragma omp for schedule(static) #endif - for (U32 hc = 0; hc < ih * ic8; hc += strideH) { - U32 c = hc / ih; - U32 h = hc % ih; - for (U32 w = 0; w < iw; w += strideW) { - U32 nh = h / strideH; - U32 nw = w / strideW; - memcpy(tmpI + c * newIw * newIh * SIMDW + (nh * newIw + nw) * SIMDW, - bInArray + c * ihiw * SIMDW + (h * iw + w) * SIMDW, SIMDW * sizeof(F32)); + for (U32 hc = 0; hc < oh * ic8; ++hc) { + U32 c = hc / oh; + U32 h = hc % oh; + for (U32 w = 0; w < ow; ++w) { + U32 nh = h * strideH; + U32 nw = w * strideW; + UNI_MEMCPY(tmpI + c * ohow * SIMDW + (h * ow + w) * SIMDW, + bInArray + c * ihiw * SIMDW + (nh * iw + nw) * SIMDW, + SIMDW * sizeof(F32)); } } paddingT = (paddingT + strideH - 1) / strideH; @@ -1875,14 +1883,26 @@ EE convolution_1x1_direct(TensorDesc inputDesc, } F32 *curI = tmpI + icb * newIw * newIh; - if (paddingT == 0 && paddingB == 0 && paddingL == 0 && paddingR == 0) { + if (phT > 0 || phB > 0) { + U32 minUpper = UNI_MIN((ocbb + ocbSize) * unrollOc, oc); + for (U32 oci = ocbb * unrollOc; oci < minUpper; oci += SIMDW) { + __m256 biasVec = _mm256_load_ps(btmp + oci); + for (U32 hw = 0; hw < phT * ow; ++hw) { + _mm256_storeu_ps(bOutArray + oci * ohow + hw * SIMDW, biasVec); + } + for (U32 hw = (oh - phB) * ow; hw < oh * ow; ++hw) { + _mm256_storeu_ps(bOutArray + oci * ohow + hw * SIMDW, biasVec); + } + } + } + if (paddingL == 0 && paddingR == 0) { #ifdef _USE_OPENMP #pragma omp for schedule(static) #endif for (U32 bIdx = 0; bIdx < hwocBlockNums; ++bIdx) { FTZ; U32 hw = (bIdx / ocbSize) * blockHwDim; - U32 hwSize = UNI_MIN(blockHwDim, ohow - hw); + U32 hwSize = UNI_MIN(blockHwDim, ohowMain - hw); U32 ocBlockIdx = bIdx % ocbSize + ocbb; U32 ocb = 
GetOcIdx(ocBlockIdx, oc, unrollOc, ocbArray); U32 ocSize = UNI_MIN(unrollOc, oc - ocb); @@ -1891,8 +1911,8 @@ EE convolution_1x1_direct(TensorDesc inputDesc, const F32 *curB = biasArray + ocb; const F32 *curW = filterArray + ocb * ic + icb * ocSize; - F32 *curO = bOutArray + ocb * oh * ow; - F32 *curE = eltwiseInput + ocb * oh * ow; + F32 *curO = bOutArray + ocb * oh * ow + phT * ow * SIMDW; + F32 *curE = eltwiseInput + ocb * oh * ow + phT * ow * SIMDW; U32 ihwSize = 0; for (U32 ihw = hw; ihw < hw + hwSize; ihw += ihwSize) { if ((hw + hwSize - ihw) >= unrollHw) { @@ -1913,7 +1933,7 @@ EE convolution_1x1_direct(TensorDesc inputDesc, #endif for (U32 bIdx = 0; bIdx < hwocBlockNums; ++bIdx) { FTZ; - U32 h = bIdx / ocbSize; + U32 h = bIdx / ocbSize + phT; U32 ocBlockIdx = bIdx % ocbSize + ocbb; U32 ocb = GetOcIdx(ocBlockIdx, oc, unrollOc, ocbArray); U32 ocSize = UNI_MIN(unrollOc, oc - ocb); @@ -1952,9 +1972,6 @@ EE convolution_1x1_direct(TensorDesc inputDesc, } #ifdef _USE_OPENMP } -#ifdef _WIN32 - ompCtr.resetOpenMP(); -#endif #endif return SUCCESS; } diff --git a/compute/tensor/src/cpu/x86/fp32/convolution_direct.cpp b/compute/tensor/src/cpu/x86/fp32/convolution_direct.cpp index ce60d335..3cb454d2 100644 --- a/compute/tensor/src/cpu/x86/fp32/convolution_direct.cpp +++ b/compute/tensor/src/cpu/x86/fp32/convolution_direct.cpp @@ -1950,10 +1950,10 @@ EE convolution_direct(TensorDesc inputDesc, // get computing params I32 strideH = convParamSpec.stride_h; I32 strideW = convParamSpec.stride_w; - I32 paddingT = convParamSpec.padding_top; - I32 paddingB = convParamSpec.padding_bottom; - I32 paddingL = convParamSpec.padding_left; - I32 paddingR = convParamSpec.padding_right; + I32 paddingT = convParamSpec.pad_top; + I32 paddingB = convParamSpec.pad_bottom; + I32 paddingL = convParamSpec.pad_left; + I32 paddingR = convParamSpec.pad_right; I32 dilateH = convParamSpec.dilatedRate_h; I32 dilateW = convParamSpec.dilatedRate_w; I32 ih_pad = ih + paddingT + paddingB; @@ -1975,7 +1975,7 @@ EE convolution_direct(TensorDesc inputDesc, I32 hwocBlockNums = hwBlockNums * ocBlockNums; I32 blockIcDim = InferConvDirectBolckIcDim(BLOCK_IC_DIM, unrollOc, blockHwDim, fh, fw); -#if defined(_WIN32) && defined(_USE_OPENMP) +#ifdef _USE_OPENMP OpenMPController ompCtr; ompCtr.checkAndSetOpenMP(ohow, BLOCK_HW_DIM, ocBlockNums); #endif @@ -1992,12 +1992,12 @@ EE convolution_direct(TensorDesc inputDesc, if (idf == DF_NCHWC8 && paddingT == 0 && paddingB == 0 && paddingL == 0 && paddingR == 0) { tmpInput = bInArray; } else { - // TODO: optimize the memcpy + // TODO: optimize the UNI_MEMCPY PaddingNCHWC8(bInArray, tmpInput, inputDesc, convParamSpec); } #ifdef _USE_OPENMP -#pragma omp parallel num_threads(OMP_NUM_THREADS) +#pragma omp parallel num_threads(OMP_NUM_THREADS) if (ompCtr.useOmp) { #endif I32 flags = 0; @@ -2050,9 +2050,5 @@ EE convolution_direct(TensorDesc inputDesc, #endif } -#if defined(_WIN32) && defined(_USE_OPENMP) - ompCtr.resetOpenMP(); -#endif - return SUCCESS; } diff --git a/compute/tensor/src/cpu/x86/fp32/convolution_direct_nchw.cpp b/compute/tensor/src/cpu/x86/fp32/convolution_direct_nchw.cpp index ae20dde8..a4036216 100644 --- a/compute/tensor/src/cpu/x86/fp32/convolution_direct_nchw.cpp +++ b/compute/tensor/src/cpu/x86/fp32/convolution_direct_nchw.cpp @@ -1671,10 +1671,10 @@ EE convolution_direct_nchw(TensorDesc inputDesc, I32 fhDilated = (fh - 1) * dilateH + 1; I32 fwDilated = (fw - 1) * dilateW + 1; //pad - I32 paddingT = convParamSpec.padding_top; - I32 paddingB = convParamSpec.padding_bottom; - I32 
paddingL = convParamSpec.padding_left; - I32 paddingR = convParamSpec.padding_right; + I32 paddingT = convParamSpec.pad_top; + I32 paddingB = convParamSpec.pad_bottom; + I32 paddingL = convParamSpec.pad_left; + I32 paddingR = convParamSpec.pad_right; I32 ohPaddingT = 0; I32 ohPaddingB = 0; if ((paddingL == 0) && (paddingR == 0) && (paddingT != 0 || paddingB != 0)) { diff --git a/compute/tensor/src/cpu/x86/fp32/convolution_functions.h b/compute/tensor/src/cpu/x86/fp32/convolution_functions.h index 71803714..75baf369 100644 --- a/compute/tensor/src/cpu/x86/fp32/convolution_functions.h +++ b/compute/tensor/src/cpu/x86/fp32/convolution_functions.h @@ -175,19 +175,17 @@ T gcd(T u, T v) return u; } -#if defined(_WIN32) && defined(_USE_OPENMP) +#ifdef _USE_OPENMP struct OpenMPController { - I32 ompThread; + bool useOmp; void checkAndSetOpenMP(I32 ohow, I32 threshold, I32 blockNums) { - ompThread = OMP_NUM_THREADS; +#ifdef _WIN32 if (ohow < threshold && blockNums < OMP_NUM_THREADS) { - OMP_NUM_THREADS = 1; + useOmp = false; } +#endif } - void resetOpenMP() - { - OMP_NUM_THREADS = ompThread; - } + OpenMPController(): useOmp(true) {} }; #endif \ No newline at end of file diff --git a/compute/tensor/src/cpu/x86/fp32/convolution_transform.cpp b/compute/tensor/src/cpu/x86/fp32/convolution_transform.cpp index f5e0df69..8cd7b7f5 100644 --- a/compute/tensor/src/cpu/x86/fp32/convolution_transform.cpp +++ b/compute/tensor/src/cpu/x86/fp32/convolution_transform.cpp @@ -16,6 +16,13 @@ #include "cpu/x86/fp32/transform_functions_fp32.h" #include "cpu/x86/fp32/convolution_functions.h" +EE convolution_winograd_transform_filter_fp32(TensorDesc filterDesc, + const F32 *filter, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + TensorDesc *ftmDesc, + F32 *filterTransformed); + // N is 32/24 template inline EE transformNCHWToNCHWCxNxWrapper( @@ -54,7 +61,7 @@ inline EE convolution_transform_filter_kernel_fp32(TensorDesc filterDesc, CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); if (fdf == ftmDataFormat) { *ftmDesc = filterDesc; - memcpy(ftmArray, filterArray, fn * fc * fh * fw * bytesOf(fdt)); + UNI_MEMCPY(ftmArray, filterArray, fn * fc * fh * fw * bytesOf(fdt)); return SUCCESS; } if (fdf != DF_NCHW) { @@ -89,6 +96,11 @@ EE convolution_transform_filter_fp32(TensorDesc filterDesc, TensorDesc *ftmDesc, F32 *filterTransformed) { + if (algorithm == CONVOLUTION_ALGORITHM_WINOGRAD) { + return convolution_winograd_transform_filter_fp32( + filterDesc, filter, convParamSpec, algorithm, ftmDesc, filterTransformed); + } + DataFormat ftmDataFormat; DataType fdt; DataFormat fdf; @@ -133,3 +145,112 @@ EE convolution_transform_filter_fp32(TensorDesc filterDesc, ftmDesc->dims[channelAxis] = filterDesc.dims[channelAxis]; return SUCCESS; } + +void transformWeight4x4_3x3( + const F32 *input, F32 *output, F32 *tmp, U32 blockIc, TensorDesc filterDesc) +{ + DataType fdt; + DataFormat fdf; + U32 fn, fc, fh, fw; + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + + __m256 v01666 = _mm256_set1_ps(0.1666666666666667f); + __m256 minusV01666 = _mm256_set1_ps(-0.1666666666666667f); + __m256 v00833 = _mm256_set1_ps(0.0833333333333333f); + __m256 minusV00833 = _mm256_set1_ps(-0.0833333333333333f); + __m256 v004166 = _mm256_set1_ps(0.0416666666666667f); + __m256 v025 = _mm256_set1_ps(0.25f); + + // U32 fn32 = fn / 32; + U32 fnBlocks[3] = {8, 16, 32}; + U32 lstep = fc * fh * fw; + __m256i vindex = _mm256_set_epi32( + lstep * 7, lstep * 6, lstep * 5, lstep * 4, lstep * 3, 
lstep * 2, lstep, 0); + + U32 cx = 0; + for (U32 c = 0; c < fc; c += cx) { + cx = UNI_MIN(blockIc, fc - c); + U32 nSize = 0; + for (U32 n = 0; n < fn; n += nSize) { + nSize = UNI_MIN(32, fn - n); + nSize = fnBlocks[nSize >> 4]; + F32 *curO = output + (c * fn + n * cx) * 36; + for (U32 cb = 0; cb < cx; ++cb) { + for (U32 ni = 0; ni < (nSize / 8); ++ni) { + const F32 *curI = input + (n + ni * 8) * lstep + (c + cb) * fh * fw; + for (U32 i = 0; i < 3; ++i) { + __m256 xi0 = _mm256_i32gather_ps(curI + i, vindex, 4); + __m256 xi1 = _mm256_i32gather_ps(curI + 3 + i, vindex, 4); + __m256 xi2 = _mm256_i32gather_ps(curI + 3 * 2 + i, vindex, 4); + + __m256 t0 = _mm256_mul_ps(v01666, xi2); + __m256 t1 = _mm256_sub_ps(_mm256_mul_ps(minusV01666, xi0), t0); + __m256 t2 = _mm256_fmadd_ps(v004166, xi0, t0); + + __m256 o0 = _mm256_mul_ps(v025, xi0); + __m256 o1 = _mm256_fmadd_ps(xi1, minusV01666, t1); + __m256 o2 = _mm256_fmadd_ps(xi1, v01666, t1); + __m256 o3 = _mm256_fmadd_ps(xi1, v00833, t2); + __m256 o4 = _mm256_fmadd_ps(xi1, minusV00833, t2); + + _mm256_storeu_ps(tmp + (i)*8, o0); + _mm256_storeu_ps(tmp + (3 + i) * 8, o1); + _mm256_storeu_ps(tmp + (3 * 2 + i) * 8, o2); + _mm256_storeu_ps(tmp + (3 * 3 + i) * 8, o3); + _mm256_storeu_ps(tmp + (3 * 4 + i) * 8, o4); + _mm256_storeu_ps(tmp + (3 * 5 + i) * 8, xi2); + } + for (U32 i = 0; i < 6; ++i) { + __m256 xi0 = _mm256_loadu_ps(tmp + (3 * i) * 8); + __m256 xi1 = _mm256_loadu_ps(tmp + (3 * i + 1) * 8); + __m256 xi2 = _mm256_loadu_ps(tmp + (3 * i + 2) * 8); + + __m256 t0 = _mm256_mul_ps(v01666, xi2); + __m256 t1 = _mm256_sub_ps(_mm256_mul_ps(minusV01666, xi0), t0); + __m256 t2 = _mm256_fmadd_ps(v004166, xi0, t0); + + __m256 o0 = _mm256_mul_ps(v025, xi0); + __m256 o1 = _mm256_fmadd_ps(xi1, minusV01666, t1); + __m256 o2 = _mm256_fmadd_ps(xi1, v01666, t1); + __m256 o3 = _mm256_fmadd_ps(xi1, v00833, t2); + __m256 o4 = _mm256_fmadd_ps(xi1, minusV00833, t2); + + _mm256_storeu_ps(curO + (6 * i) * nSize * cx + cb * nSize + ni * 8, o0); + _mm256_storeu_ps(curO + (6 * i + 1) * nSize * cx + cb * nSize + ni * 8, o1); + _mm256_storeu_ps(curO + (6 * i + 2) * nSize * cx + cb * nSize + ni * 8, o2); + _mm256_storeu_ps(curO + (6 * i + 3) * nSize * cx + cb * nSize + ni * 8, o3); + _mm256_storeu_ps(curO + (6 * i + 4) * nSize * cx + cb * nSize + ni * 8, o4); + _mm256_storeu_ps(curO + (6 * i + 5) * nSize * cx + cb * nSize + ni * 8, xi2); + } + } + } + } + } +} + +EE convolution_winograd_transform_filter_fp32(TensorDesc filterDesc, + const F32 *filter, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + TensorDesc *ftmDesc, + F32 *filterTransformed) +{ + // F(4x4, 3x3) + if (nullptr == filter || nullptr == ftmDesc || nullptr == filterTransformed) { + CHECK_STATUS(NULL_POINTER); + } + DataType fdt; + DataFormat fdf; + U32 fn, fc, fh, fw; + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + if (fdf != DF_NCHW) { + CHECK_STATUS(NOT_SUPPORTED); + } + + U32 blockIc = UNI_MIN(32, fc); + F32 *tmp = filterTransformed + fn * fc * 36; + transformWeight4x4_3x3(filter, filterTransformed, tmp, blockIc, filterDesc); + *ftmDesc = tensor4df(fdt, DF_NCHWCxN32, fn, fc, fh, fw); + + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/x86/fp32/convolution_winograd.cpp b/compute/tensor/src/cpu/x86/fp32/convolution_winograd.cpp new file mode 100644 index 00000000..b6062686 --- /dev/null +++ b/compute/tensor/src/cpu/x86/fp32/convolution_winograd.cpp @@ -0,0 +1,1623 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "error.h" + +#include "cpu/x86/fp32/tensor_computing_fp32.h" +#include "cpu/x86/fp32/transform_functions_fp32.h" +#include "cpu/x86/fp32/convolution_functions.h" + +#define BLOCK_IC_DIM 32 + +void transformInput4x4_3x3( + F32 *input, F32 *output, F32 *tmp, U32 iw, U32 ih, U32 ic, U32 wSize, U32 blockIc) +{ + __m256 four = _mm256_set1_ps(4.0f); + __m256 minusFour = _mm256_set1_ps(-4.0f); + __m256 two = _mm256_set1_ps(2.0f); + __m256 minusFive = _mm256_set1_ps(-5.0f); + U32 icb = ic / blockIc; + U32 cb = blockIc / 8; + for (U32 w = 0; w < wSize; ++w) { + for (U32 c = 0; c < icb; ++c) { + for (U32 cc = 0; cc < cb; ++cc) { + F32 *curI = input + (c * blockIc + cc * 8) * ih * iw + w * 4 * 8; + F32 *curO = output + (w * ic + c * blockIc) * 36 + cc * 8; + for (U32 i = 0; i < 6; ++i) { + __m256 xi0 = _mm256_loadu_ps(curI + (i)*8); + __m256 xi1 = _mm256_loadu_ps(curI + (iw + i) * 8); + __m256 xi2 = _mm256_loadu_ps(curI + (iw * 2 + i) * 8); + __m256 xi3 = _mm256_loadu_ps(curI + (iw * 3 + i) * 8); + __m256 xi4 = _mm256_loadu_ps(curI + (iw * 4 + i) * 8); + __m256 xi5 = _mm256_loadu_ps(curI + (iw * 5 + i) * 8); + + __m256 t0 = _mm256_fmadd_ps(minusFour, xi2, xi4); + __m256 t1 = _mm256_fmadd_ps(minusFour, xi1, xi3); + __m256 t2 = _mm256_sub_ps(xi4, xi2); + __m256 t3 = _mm256_mul_ps(two, _mm256_sub_ps(xi3, xi1)); + __m256 t4 = _mm256_fmadd_ps(four, xi0, xi4); + __m256 t5 = _mm256_fmadd_ps(four, xi1, xi5); + + xi0 = _mm256_fmadd_ps(minusFive, xi2, t4); + xi5 = _mm256_fmadd_ps(minusFive, xi3, t5); + xi1 = _mm256_add_ps(t1, t0); + xi2 = _mm256_sub_ps(t0, t1); + xi3 = _mm256_add_ps(t3, t2); + xi4 = _mm256_sub_ps(t2, t3); + + _mm256_storeu_ps(tmp + (i)*8, xi0); + _mm256_storeu_ps(tmp + (6 + i) * 8, xi1); + _mm256_storeu_ps(tmp + (6 * 2 + i) * 8, xi2); + _mm256_storeu_ps(tmp + (6 * 3 + i) * 8, xi3); + _mm256_storeu_ps(tmp + (6 * 4 + i) * 8, xi4); + _mm256_storeu_ps(tmp + (6 * 5 + i) * 8, xi5); + } + + for (U32 i = 0; i < 6; ++i) { + __m256 xi0 = _mm256_loadu_ps(tmp + (i * 6) * 8); + __m256 xi1 = _mm256_loadu_ps(tmp + (i * 6 + 1) * 8); + __m256 xi2 = _mm256_loadu_ps(tmp + (i * 6 + 2) * 8); + __m256 xi3 = _mm256_loadu_ps(tmp + (i * 6 + 3) * 8); + __m256 xi4 = _mm256_loadu_ps(tmp + (i * 6 + 4) * 8); + __m256 xi5 = _mm256_loadu_ps(tmp + (i * 6 + 5) * 8); + + if (cc % 2 == 0) { + _mm_prefetch(curO + (6 * i) * blockIc, _MM_HINT_NTA); + _mm_prefetch(curO + (6 * i + 1) * blockIc, _MM_HINT_NTA); + _mm_prefetch(curO + (6 * i + 2) * blockIc, 
_MM_HINT_NTA); + _mm_prefetch(curO + (6 * i + 3) * blockIc, _MM_HINT_NTA); + _mm_prefetch(curO + (6 * i + 4) * blockIc, _MM_HINT_NTA); + _mm_prefetch(curO + (6 * i + 5) * blockIc, _MM_HINT_NTA); + } + __m256 t0 = _mm256_fmadd_ps(minusFour, xi2, xi4); + __m256 t1 = _mm256_fmadd_ps(minusFour, xi1, xi3); + __m256 t2 = _mm256_sub_ps(xi4, xi2); + __m256 t3 = _mm256_mul_ps(two, _mm256_sub_ps(xi3, xi1)); + __m256 t4 = _mm256_fmadd_ps(four, xi0, xi4); + __m256 t5 = _mm256_fmadd_ps(four, xi1, xi5); + + xi0 = _mm256_fmadd_ps(minusFive, xi2, t4); + xi5 = _mm256_fmadd_ps(minusFive, xi3, t5); + xi1 = _mm256_add_ps(t1, t0); + xi2 = _mm256_sub_ps(t0, t1); + xi3 = _mm256_add_ps(t3, t2); + xi4 = _mm256_sub_ps(t2, t3); + + _mm256_storeu_ps(curO + (6 * i) * blockIc, xi0); + _mm256_storeu_ps(curO + (6 * i + 1) * blockIc, xi1); + _mm256_storeu_ps(curO + (6 * i + 2) * blockIc, xi2); + _mm256_storeu_ps(curO + (6 * i + 3) * blockIc, xi3); + _mm256_storeu_ps(curO + (6 * i + 4) * blockIc, xi4); + _mm256_storeu_ps(curO + (6 * i + 5) * blockIc, xi5); + } + } + } + } +} + +void transformInputWithPad4x4_3x3(F32 *input, + F32 *output, + F32 *tmp, + U32 iw, + U32 ih, + U32 ic, + U32 wSize, + U32 blockIc, + U32 pl, + U32 pr, + U32 pt, + U32 pb, + U32 h, + U32 w, + U32 oh, + U32 ow) +{ + __m256 four = _mm256_set1_ps(4.0f); + __m256 minusFour = _mm256_set1_ps(-4.0f); + __m256 two = _mm256_set1_ps(2.0f); + __m256 minusFive = _mm256_set1_ps(-5.0f); + U32 icb = ic / blockIc; + U32 cb = blockIc / 8; + + pt = (h > pt) ? 0 : (pt - h); + pl = (w > pl) ? 0 : (pl - w); + for (U32 uw = 0; uw < wSize; ++uw) { + for (U32 c = 0; c < icb; ++c) { + for (U32 cc = 0; cc < cb; ++cc) { + F32 *curI = input + (c * blockIc + cc * 8) * ih * iw; + F32 *curO = output + (uw * ic + c * blockIc) * 36 + cc * 8; + U32 i = 0; + for (; ((i + w) < pl) && (i < 6); ++i) { + UNI_MEMSET(tmp + (i)*8, 0, 32); + UNI_MEMSET(tmp + (6 + i) * 8, 0, 32); + UNI_MEMSET(tmp + (6 * 2 + i) * 8, 0, 32); + UNI_MEMSET(tmp + (6 * 3 + i) * 8, 0, 32); + UNI_MEMSET(tmp + (6 * 4 + i) * 8, 0, 32); + UNI_MEMSET(tmp + (6 * 5 + i) * 8, 0, 32); + } + for (; ((i + w + pr) < (ow + 2)) && (i < 6); ++i) { + __m256 xi[6]; + U32 b = 0; + for (; ((b + h) < pt) && (b < 6); ++b) { + xi[b] = _mm256_setzero_ps(); + } + for (; ((b + h + pb) < (oh + 2)) && (b < 6); ++b) { + xi[b] = _mm256_loadu_ps(curI + (iw * (b - pt) + i + uw * 4 - pl) * 8); + } + for (; ((b + h) < (oh + 2)) && (b < 6); ++b) { + xi[b] = _mm256_setzero_ps(); + } + + __m256 t0 = _mm256_fmadd_ps(minusFour, xi[2], xi[4]); + __m256 t1 = _mm256_fmadd_ps(minusFour, xi[1], xi[3]); + __m256 t2 = _mm256_sub_ps(xi[4], xi[2]); + __m256 t3 = _mm256_mul_ps(two, _mm256_sub_ps(xi[3], xi[1])); + __m256 t4 = _mm256_fmadd_ps(four, xi[0], xi[4]); + __m256 t5 = _mm256_fmadd_ps(four, xi[1], xi[5]); + + xi[0] = _mm256_fmadd_ps(minusFive, xi[2], t4); + xi[5] = _mm256_fmadd_ps(minusFive, xi[3], t5); + xi[1] = _mm256_add_ps(t1, t0); + xi[2] = _mm256_sub_ps(t0, t1); + xi[3] = _mm256_add_ps(t3, t2); + xi[4] = _mm256_sub_ps(t2, t3); + + _mm256_storeu_ps(tmp + (i)*8, xi[0]); + _mm256_storeu_ps(tmp + (6 + i) * 8, xi[1]); + _mm256_storeu_ps(tmp + (6 * 2 + i) * 8, xi[2]); + _mm256_storeu_ps(tmp + (6 * 3 + i) * 8, xi[3]); + _mm256_storeu_ps(tmp + (6 * 4 + i) * 8, xi[4]); + _mm256_storeu_ps(tmp + (6 * 5 + i) * 8, xi[5]); + } + for (; ((i + w) < (ow + 2)) && (i < 6); ++i) { + UNI_MEMSET(tmp + (i)*8, 0, 32); + UNI_MEMSET(tmp + (6 + i) * 8, 0, 32); + UNI_MEMSET(tmp + (6 * 2 + i) * 8, 0, 32); + UNI_MEMSET(tmp + (6 * 3 + i) * 8, 0, 32); + UNI_MEMSET(tmp + (6 * 4 + i) * 
8, 0, 32); + UNI_MEMSET(tmp + (6 * 5 + i) * 8, 0, 32); + } + + for (U32 j = 0; j < 6; ++j) { + __m256 xi0 = _mm256_loadu_ps(tmp + (j * 6) * 8); + __m256 xi1 = _mm256_loadu_ps(tmp + (j * 6 + 1) * 8); + __m256 xi2 = _mm256_loadu_ps(tmp + (j * 6 + 2) * 8); + __m256 xi3 = _mm256_loadu_ps(tmp + (j * 6 + 3) * 8); + __m256 xi4 = _mm256_loadu_ps(tmp + (j * 6 + 4) * 8); + __m256 xi5 = _mm256_loadu_ps(tmp + (j * 6 + 5) * 8); + + if (cc % 2 == 0) { + _mm_prefetch(curO + (6 * j) * blockIc, _MM_HINT_NTA); + _mm_prefetch(curO + (6 * j + 1) * blockIc, _MM_HINT_NTA); + _mm_prefetch(curO + (6 * j + 2) * blockIc, _MM_HINT_NTA); + _mm_prefetch(curO + (6 * j + 3) * blockIc, _MM_HINT_NTA); + _mm_prefetch(curO + (6 * j + 4) * blockIc, _MM_HINT_NTA); + _mm_prefetch(curO + (6 * j + 5) * blockIc, _MM_HINT_NTA); + } + __m256 t0 = _mm256_fmadd_ps(minusFour, xi2, xi4); + __m256 t1 = _mm256_fmadd_ps(minusFour, xi1, xi3); + __m256 t2 = _mm256_sub_ps(xi4, xi2); + __m256 t3 = _mm256_mul_ps(two, _mm256_sub_ps(xi3, xi1)); + __m256 t4 = _mm256_fmadd_ps(four, xi0, xi4); + __m256 t5 = _mm256_fmadd_ps(four, xi1, xi5); + + xi0 = _mm256_fmadd_ps(minusFive, xi2, t4); + xi5 = _mm256_fmadd_ps(minusFive, xi3, t5); + xi1 = _mm256_add_ps(t1, t0); + xi2 = _mm256_sub_ps(t0, t1); + xi3 = _mm256_add_ps(t3, t2); + xi4 = _mm256_sub_ps(t2, t3); + + _mm256_storeu_ps(curO + (6 * j) * blockIc, xi0); + _mm256_storeu_ps(curO + (6 * j + 1) * blockIc, xi1); + _mm256_storeu_ps(curO + (6 * j + 2) * blockIc, xi2); + _mm256_storeu_ps(curO + (6 * j + 3) * blockIc, xi3); + _mm256_storeu_ps(curO + (6 * j + 4) * blockIc, xi4); + _mm256_storeu_ps(curO + (6 * j + 5) * blockIc, xi5); + } + } + } + w += 4; + } +} + +void transformOutput4x4_3x3(F32 *input, + F32 *output, + F32 *tmp, + const F32 *bias, + U32 ow, + U32 oh, + U32 oc, + U32 wSize, + bool addF, + ActivationMode mode) +{ + I64 flag = (I64)addF | (I64(mode) << 1); + __m256 four = _mm256_set1_ps(4.0f); + __m256 eight = _mm256_set1_ps(8.0f); + U32 ocb = oc / 8; + for (U32 c = 0; c < ocb; ++c) { + for (U32 w = 0; w < wSize; ++w) { + F32 *curI = input + w * oc + c * 8; + F32 *curO = output + w * 32 + c * 8 * oh * ow; + I64 stepI = 24 * oc * wSize; + I64 stepT = 192; + for (U32 i = 0; i < 6; ++i) { + F32 *useI0 = curI + i * oc * wSize; + F32 *useI1 = useI0 + 18 * oc * wSize; + F32 *useO0 = tmp + i * 8; + F32 *useO1 = useO0 + 96; + __asm__ __volatile__( + "vmovups (%[input0]), %%ymm0 \n\t" + "vmovups (%[input0], %[stepI]), %%ymm1 \n\t" + "vmovups (%[input0], %[stepI], 2), %%ymm2 \n\t" + "vmovups (%[input1]), %%ymm3 \n\t" + "vmovups (%[input1], %[stepI]), %%ymm4 \n\t" + "vmovups (%[input1], %[stepI], 2), %%ymm5 \n\t" + "vaddps %%ymm2, %%ymm1, %%ymm6 \n\t" + "vaddps %%ymm3, %%ymm4, %%ymm7 \n\t" + "vsubps %%ymm2, %%ymm1, %%ymm8 \n\t" + "vsubps %%ymm4, %%ymm3, %%ymm9 \n\t" + "vaddps %%ymm6, %%ymm7, %%ymm1 \n\t" + "vaddps %%ymm9, %%ymm9, %%ymm3 \n\t" + "vaddps %%ymm0, %%ymm1, %%ymm11 \n\t" // xi0 + "vaddps %%ymm8, %%ymm3, %%ymm12 \n\t" // xi1 + "vmovups %%ymm11, (%[output0]) \n\t" + "vmovups %%ymm12, (%[output0], %[stepT]) \n\t" + "vfmadd231ps %[eight], %%ymm9, %%ymm8 \n\t" + "vfmadd231ps %[four], %%ymm7, %%ymm6 \n\t" + "vaddps %%ymm5, %%ymm8, %%ymm10 \n\t" // xi3 + "vmovups %%ymm6, (%[output1]) \n\t" + "vmovups %%ymm10, (%[output1], %[stepT]) \n\t" + : + : [input0] "r"(useI0), [input1] "r"(useI1), [stepI] "r"(stepI), [output0] "r"(useO0), + [output1] "r"(useO1), [stepT] "r"(stepT), [four] "x"(four), [eight] "x"(eight) + : "%rax", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", + "%ymm7", "%ymm8", 
"%ymm9", "%ymm10", "%ymm11", "%ymm12", "memory", "cc"); + } + + stepT = 32; + stepI = 32; + for (U32 i = 0; i < 4; ++i) { + F32 *useI0 = tmp + 48 * i; + F32 *useI1 = useI0 + 24; + F32 *useO0 = curO + ow * i * 8; + F32 *useO1 = useO0 + 16; + __asm__ __volatile__( + "vmovups (%[input0]), %%ymm0 \n\t" + "vmovups (%[input0], %[stepI]), %%ymm1 \n\t" + "vmovups (%[input0], %[stepI], 2), %%ymm2 \n\t" + "vmovups (%[input1]), %%ymm3 \n\t" + "vmovups (%[input1], %[stepI]), %%ymm4 \n\t" + "vmovups (%[input1], %[stepI], 2), %%ymm5 \n\t" + "prefetcht0 (%[output0]) \n\t" + "prefetcht0 (%[output1]) \n\t" + "vaddps %%ymm2, %%ymm1, %%ymm6 \n\t" + "vaddps %%ymm3, %%ymm4, %%ymm7 \n\t" + "vsubps %%ymm2, %%ymm1, %%ymm8 \n\t" + "vsubps %%ymm4, %%ymm3, %%ymm9 \n\t" + "vaddps %%ymm6, %%ymm7, %%ymm1 \n\t" + "vaddps %%ymm9, %%ymm9, %%ymm3 \n\t" + "vaddps %%ymm0, %%ymm1, %%ymm11 \n\t" // xi0 + "vaddps %%ymm8, %%ymm3, %%ymm12 \n\t" // xi1 + "vfmadd231ps %[eight], %%ymm9, %%ymm8 \n\t" + "vfmadd231ps %[four], %%ymm7, %%ymm6 \n\t" + "vaddps %%ymm5, %%ymm8, %%ymm10 \n\t" // xi3 + "mov %[flag], %%rax \n\t" + "and $0x1, %%rax \n\t" + "je 0f \n\t" + "vaddps (%[output0]), %%ymm11, %%ymm11 \n\t" + "vaddps (%[output0], %[stepT]), %%ymm12, %%ymm12 \n\t" + "vaddps (%[output1]), %%ymm6, %%ymm6 \n\t" + "vaddps (%[output1], %[stepT]), %%ymm10, %%ymm10 \n\t" + "jmp 1f \n\t" + ".align 16 \n\t" + "0: \n\t" + "vmovups (%[bias]), %%ymm0 \n\t" + "vaddps %%ymm0, %%ymm11, %%ymm11 \n\t" + "vaddps %%ymm0, %%ymm12, %%ymm12 \n\t" + "vaddps %%ymm0, %%ymm6, %%ymm6 \n\t" + "vaddps %%ymm0, %%ymm10, %%ymm10 \n\t" + ".align 16 \n\t" + "1: \n\t" + "mov %[flag], %%rax \n\t" + "or $0x1, %%rax \n\t" + "cmp $0x3, %%rax \n\t" + "jne 2f \n\t" + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm0, %%ymm6, %%ymm6 \n\t" + "vmaxps %%ymm0, %%ymm12, %%ymm12 \n\t" + "vmaxps %%ymm0, %%ymm10, %%ymm10 \n\t" + "vmaxps %%ymm0, %%ymm11, %%ymm11 \n\t" + ".align 16 \n\t" + "2: \n\t" + "vmovups %%ymm11, (%[output0]) \n\t" + "vmovups %%ymm12, (%[output0], %[stepT]) \n\t" + "vmovups %%ymm6, (%[output1]) \n\t" + "vmovups %%ymm10, (%[output1], %[stepT]) \n\t" + : + : [input0] "r"(useI0), [input1] "r"(useI1), [stepI] "r"(stepI), + [output0] "r"(useO0), [output1] "r"(useO1), [stepT] "r"(stepT), + [four] "x"(four), [eight] "x"(eight), [flag] "r"(flag), [bias] "r"(bias) + : "%rax", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", + "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "memory", "cc"); + } + } + bias += 8; + } +} + +void transformOutputWithPad4x4_3x3(F32 *input, + F32 *output, + F32 *tmp, + const F32 *bias, + U32 ow, + U32 oh, + U32 oc, + U32 wSize, + bool addF, + U32 pr, + U32 pb, + U32 h, + U32 w, + ActivationMode mode) +{ + __m256 two = _mm256_set1_ps(2.0f); + __m256 four = _mm256_set1_ps(4.0f); + __m256 eight = _mm256_set1_ps(8.0f); + U32 ocb = oc / 8; + for (U32 c = 0; c < ocb; ++c) { + for (U32 uw = 0; uw < wSize; ++uw) { + F32 *curI = input + uw * oc + c * 8; + F32 *curO = output + uw * 32 + c * 8 * oh * ow; + I64 stepI = 24 * oc * wSize; + I64 stepT = 192; + for (U32 i = 0; i < 6; ++i) { + F32 *useI0 = curI + i * oc * wSize; + F32 *useI1 = useI0 + 18 * oc * wSize; + F32 *useO0 = tmp + i * 8; + F32 *useO1 = useO0 + 96; + __asm__ __volatile__( + "vmovups (%[input0]), %%ymm0 \n\t" + "vmovups (%[input0], %[stepI]), %%ymm1 \n\t" + "vmovups (%[input0], %[stepI], 2), %%ymm2 \n\t" + "vmovups (%[input1]), %%ymm3 \n\t" + "vmovups (%[input1], %[stepI]), %%ymm4 \n\t" + "vmovups (%[input1], %[stepI], 2), %%ymm5 \n\t" + "vaddps %%ymm2, %%ymm1, %%ymm6 
\n\t" + "vaddps %%ymm3, %%ymm4, %%ymm7 \n\t" + "vsubps %%ymm2, %%ymm1, %%ymm8 \n\t" + "vsubps %%ymm4, %%ymm3, %%ymm9 \n\t" + "vaddps %%ymm6, %%ymm7, %%ymm1 \n\t" + "vaddps %%ymm9, %%ymm9, %%ymm3 \n\t" + "vaddps %%ymm0, %%ymm1, %%ymm11 \n\t" // xi0 + "vaddps %%ymm8, %%ymm3, %%ymm12 \n\t" // xi1 + "vmovups %%ymm11, (%[output0]) \n\t" + "vmovups %%ymm12, (%[output0], %[stepT]) \n\t" + "vfmadd231ps %[eight], %%ymm9, %%ymm8 \n\t" + "vfmadd231ps %[four], %%ymm7, %%ymm6 \n\t" + "vaddps %%ymm5, %%ymm8, %%ymm10 \n\t" // xi3 + "vmovups %%ymm6, (%[output1]) \n\t" + "vmovups %%ymm10, (%[output1], %[stepT]) \n\t" + : + : [input0] "r"(useI0), [input1] "r"(useI1), [stepI] "r"(stepI), [output0] "r"(useO0), + [output1] "r"(useO1), [stepT] "r"(stepT), [four] "x"(four), [eight] "x"(eight) + : "%rax", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", + "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "memory", "cc"); + } + for (U32 i = 0; (i < 4) && (i + h < oh); ++i) { + __m256 xi[6]; + for (U32 j = 0; j < 6; ++j) { + xi[j] = _mm256_loadu_ps(tmp + (6 * i + j) * 8); + } + + __m256 t0 = _mm256_add_ps(xi[1], xi[2]); + __m256 t1 = _mm256_add_ps(xi[4], xi[3]); + __m256 t2 = _mm256_sub_ps(xi[1], xi[2]); + __m256 t3 = _mm256_sub_ps(xi[3], xi[4]); + + xi[0] = _mm256_add_ps(_mm256_add_ps(t0, t1), xi[0]); + xi[1] = _mm256_fmadd_ps(two, t3, t2); + xi[2] = _mm256_fmadd_ps(four, t1, t0); + xi[3] = _mm256_add_ps(_mm256_fmadd_ps(eight, t3, t2), xi[5]); + + if (addF) { + for (U32 j = 0; (j < 4) && (j + w + uw * 4 < ow); ++j) { + xi[j] = _mm256_add_ps(xi[j], + _mm256_loadu_ps(output + (ow * i + uw * 4 + j) * 8 + c * 8 * oh * ow)); + } + } else { + __m256 b = _mm256_loadu_ps(bias + c * 8); + for (U32 j = 0; (j < 4) && (j + w + uw * 4 < ow); ++j) { + xi[j] = _mm256_add_ps(xi[j], b); + } + } + + if (mode) { + __m256 zero = _mm256_setzero_ps(); + for (U32 j = 0; (j < 4) && (j + w + uw * 4 < ow); ++j) { + xi[j] = _mm256_max_ps(xi[j], zero); + } + } + + for (U32 j = 0; (j < 4) && (j + w + uw * 4 < ow); ++j) { + _mm256_storeu_ps(output + (ow * i + uw * 4 + j) * 8 + c * 8 * oh * ow, xi[j]); + } + } + } + } +} + +struct ConvController { + F32 **input; + const F32 *filter; + void *output; + F32 *eltwise; + I64 ic; + I64 fStep; + I64 flags; +}; + +typedef void (*kernelFunc)(ConvController &c); + +void winoKernel3x32(ConvController &c) +{ + __asm__ __volatile__( + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" + "vxorps %%ymm2, %%ymm2, %%ymm2 \n\t" + "vxorps %%ymm3, %%ymm3, %%ymm3 \n\t" + "vxorps %%ymm4, %%ymm4, %%ymm4 \n\t" + "vxorps %%ymm5, %%ymm5, %%ymm5 \n\t" + "vxorps %%ymm6, %%ymm6, %%ymm6 \n\t" + "vxorps %%ymm7, %%ymm7, %%ymm7 \n\t" + "vxorps %%ymm8, %%ymm8, %%ymm8 \n\t" + "vxorps %%ymm9, %%ymm9, %%ymm9 \n\t" + "vxorps %%ymm10, %%ymm10, %%ymm10 \n\t" + "vxorps %%ymm11, %%ymm11, %%ymm11 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "vbroadcastss (%[input0]), %%ymm12 \n\t" + "vbroadcastss (%[input1]), %%ymm13 \n\t" + "vbroadcastss (%[input2]), %%ymm14 \n\t" + "vmovups 0x0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "prefetcht0 0x100(%[filter]) \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovups 0x20(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + "vmovups 0x40(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + 
"prefetcht0 0x140(%[filter]) \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vmovups 0x60(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "vbroadcastss 0x4(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x4(%[input1]), %%ymm13 \n\t" + "vbroadcastss 0x4(%[input2]), %%ymm14 \n\t" + "vmovups 0x80(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "prefetcht0 0x180(%[filter]) \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovups 0xA0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + "vmovups 0xC0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "prefetcht0 0x1C0(%[filter]) \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vmovups 0xE0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "vbroadcastss 0x8(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x8(%[input1]), %%ymm13 \n\t" + "vbroadcastss 0x8(%[input2]), %%ymm14 \n\t" + "vmovups 0x100(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "prefetcht0 0x200(%[filter]) \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovups 0x120(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + "vmovups 0x140(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "prefetcht0 0x240(%[filter]) \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vmovups 0x160(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "vbroadcastss 0xC(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0xC(%[input1]), %%ymm13 \n\t" + "vbroadcastss 0xC(%[input2]), %%ymm14 \n\t" + "vmovups 0x180(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "prefetcht0 0x280(%[filter]) \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovups 0x1A0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + "vmovups 0x1C0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "prefetcht0 0x2C0(%[filter]) \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vmovups 0x1E0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "vbroadcastss 0x10(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x10(%[input1]), %%ymm13 \n\t" + "vbroadcastss 0x10(%[input2]), %%ymm14 \n\t" + "vmovups 0x200(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "prefetcht0 0x300(%[filter]) \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovups 0x220(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, 
%%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + "vmovups 0x240(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "prefetcht0 0x340(%[filter]) \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vmovups 0x260(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "vbroadcastss 0x14(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x14(%[input1]), %%ymm13 \n\t" + "vbroadcastss 0x14(%[input2]), %%ymm14 \n\t" + "vmovups 0x280(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "prefetcht0 0x380(%[filter]) \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovups 0x2A0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + "vmovups 0x2C0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "prefetcht0 0x3C0(%[filter]) \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vmovups 0x2E0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "vbroadcastss 0x18(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x18(%[input1]), %%ymm13 \n\t" + "vbroadcastss 0x18(%[input2]), %%ymm14 \n\t" + "vmovups 0x300(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "prefetcht0 0x400(%[filter]) \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovups 0x320(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + "vmovups 0x340(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "prefetcht0 0x440(%[filter]) \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vmovups 0x360(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "vbroadcastss 0x1C(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x1C(%[input1]), %%ymm13 \n\t" + "vbroadcastss 0x1C(%[input2]), %%ymm14 \n\t" + "vmovups 0x380(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "prefetcht0 0x480(%[filter]) \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovups 0x3A0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + "vmovups 0x3C0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "prefetcht0 0x4C0(%[filter]) \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vmovups 0x3E0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "add $0x20, %[input0] \n\t" + "add $0x20, %[input1] \n\t" + "add $0x20, %[input2] \n\t" + "add $0x400, %[filter] \n\t" + "dec %%rcx \n\t" + "jg 1b \n\t" + + "vmovups %%ymm0, (%[output]) \n\t" + "vmovups %%ymm3, 0x20(%[output]) \n\t" + "vmovups %%ymm6, 
0x40(%[output]) \n\t" + "vmovups %%ymm9, 0x60(%[output]) \n\t" + "vmovups %%ymm1, 0x80(%[output]) \n\t" + "vmovups %%ymm4, 0xA0(%[output]) \n\t" + "vmovups %%ymm7, 0xC0(%[output]) \n\t" + "vmovups %%ymm10, 0xE0(%[output]) \n\t" + "vmovups %%ymm2, 0x100(%[output]) \n\t" + "vmovups %%ymm5, 0x120(%[output]) \n\t" + "vmovups %%ymm8, 0x140(%[output]) \n\t" + "vmovups %%ymm11, 0x160(%[output]) \n\t" + : + : [input0] "r"(c.input[0]), [input1] "r"(c.input[1]), [input2] "r"(c.input[2]), + [filter] "r"(c.filter), [output] "r"(c.output), [ic] "c"(c.ic) + : "%rax", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", + "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory", "cc"); +} + +void winoKernel2x32(ConvController &c) +{ + __asm__ __volatile__( + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" + "vxorps %%ymm3, %%ymm3, %%ymm3 \n\t" + "vxorps %%ymm4, %%ymm4, %%ymm4 \n\t" + "vxorps %%ymm6, %%ymm6, %%ymm6 \n\t" + "vxorps %%ymm7, %%ymm7, %%ymm7 \n\t" + "vxorps %%ymm9, %%ymm9, %%ymm9 \n\t" + "vxorps %%ymm10, %%ymm10, %%ymm10 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "vbroadcastss (%[input0]), %%ymm12 \n\t" + "vbroadcastss (%[input1]), %%ymm13 \n\t" + "vmovups 0x0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "prefetcht0 0x100(%[filter]) \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vmovups 0x20(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vmovups 0x40(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "prefetcht0 0x140(%[filter]) \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vmovups 0x60(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + + "vbroadcastss 0x4(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x4(%[input1]), %%ymm13 \n\t" + "vmovups 0x80(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "prefetcht0 0x180(%[filter]) \n\t" + "vmovups 0xA0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vmovups 0xC0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "prefetcht0 0x1C0(%[filter]) \n\t" + "vmovups 0xE0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + + "vbroadcastss 0x8(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x8(%[input1]), %%ymm13 \n\t" + "vmovups 0x100(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "prefetcht0 0x200(%[filter]) \n\t" + "vmovups 0x120(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vmovups 0x140(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "prefetcht0 0x240(%[filter]) \n\t" + "vmovups 0x160(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + + "vbroadcastss 0xC(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0xC(%[input1]), %%ymm13 \n\t" + "vmovups 0x180(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "prefetcht0 0x280(%[filter]) \n\t" + "vmovups 0x1A0(%[filter]), 
%%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vmovups 0x1C0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "prefetcht0 0x2C0(%[filter]) \n\t" + "vmovups 0x1E0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + + "vbroadcastss 0x10(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x10(%[input1]), %%ymm13 \n\t" + "vmovups 0x200(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "prefetcht0 0x300(%[filter]) \n\t" + "vmovups 0x220(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vmovups 0x240(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "prefetcht0 0x340(%[filter]) \n\t" + "vmovups 0x260(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + + "vbroadcastss 0x14(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x14(%[input1]), %%ymm13 \n\t" + "vmovups 0x280(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "prefetcht0 0x380(%[filter]) \n\t" + "vmovups 0x2A0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vmovups 0x2C0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "prefetcht0 0x3C0(%[filter]) \n\t" + "vmovups 0x2E0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + + "vbroadcastss 0x18(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x18(%[input1]), %%ymm13 \n\t" + "vmovups 0x300(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "prefetcht0 0x400(%[filter]) \n\t" + "vmovups 0x320(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vmovups 0x340(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "prefetcht0 0x440(%[filter]) \n\t" + "vmovups 0x360(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + + "vbroadcastss 0x1C(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x1C(%[input1]), %%ymm13 \n\t" + "vmovups 0x380(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "prefetcht0 0x480(%[filter]) \n\t" + "vmovups 0x3A0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vmovups 0x3C0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "prefetcht0 0x4C0(%[filter]) \n\t" + "vmovups 0x3E0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + + "add $0x20, %[input0] \n\t" + "add $0x20, %[input1] \n\t" + "add $0x400, %[filter] \n\t" + "dec %%rcx \n\t" + "jg 1b \n\t" + + "vmovups %%ymm0, (%[output]) \n\t" + "vmovups %%ymm3, 0x20(%[output]) \n\t" + "vmovups %%ymm6, 0x40(%[output]) \n\t" + "vmovups %%ymm9, 0x60(%[output]) \n\t" + 
"vmovups %%ymm1, 0x80(%[output]) \n\t" + "vmovups %%ymm4, 0xA0(%[output]) \n\t" + "vmovups %%ymm7, 0xC0(%[output]) \n\t" + "vmovups %%ymm10, 0xE0(%[output]) \n\t" + : + : [input0] "r"(c.input[0]), [input1] "r"(c.input[1]), [input2] "r"(c.input[2]), + [filter] "r"(c.filter), [output] "r"(c.output), [ic] "c"(c.ic) + : "%rax", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", + "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory", "cc"); +} + +void winoKernel1x32(ConvController &c) +{ + __asm__ __volatile__( + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorps %%ymm3, %%ymm3, %%ymm3 \n\t" + "vxorps %%ymm6, %%ymm6, %%ymm6 \n\t" + "vxorps %%ymm9, %%ymm9, %%ymm9 \n\t" + + ".align 16 \n\t" + "1: \n\t" + + "vbroadcastss (%[input0]), %%ymm12 \n\t" + "vmovups 0x0(%[filter]), %%ymm15 \n\t" + "vmovups 0x20(%[filter]), %%ymm10 \n\t" + "vmovups 0x40(%[filter]), %%ymm13 \n\t" + "vmovups 0x60(%[filter]), %%ymm14 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm10, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm9 \n\t" + + "vbroadcastss 0x4(%[input0]), %%ymm12 \n\t" + "vmovups 0x80(%[filter]), %%ymm15 \n\t" + "vmovups 0xA0(%[filter]), %%ymm10 \n\t" + "vmovups 0xC0(%[filter]), %%ymm13 \n\t" + "vmovups 0xE0(%[filter]), %%ymm14 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm10, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm9 \n\t" + + "vbroadcastss 0x8(%[input0]), %%ymm12 \n\t" + "vmovups 0x100(%[filter]), %%ymm15 \n\t" + "vmovups 0x120(%[filter]), %%ymm10 \n\t" + "vmovups 0x140(%[filter]), %%ymm13 \n\t" + "vmovups 0x160(%[filter]), %%ymm14 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm10, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm9 \n\t" + + "vbroadcastss 0xC(%[input0]), %%ymm12 \n\t" + "vmovups 0x180(%[filter]), %%ymm15 \n\t" + "vmovups 0x1A0(%[filter]), %%ymm10 \n\t" + "vmovups 0x1C0(%[filter]), %%ymm13 \n\t" + "vmovups 0x1E0(%[filter]), %%ymm14 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm10, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm9 \n\t" + + "vbroadcastss 0x10(%[input0]), %%ymm12 \n\t" + "vmovups 0x200(%[filter]), %%ymm15 \n\t" + "vmovups 0x220(%[filter]), %%ymm10 \n\t" + "vmovups 0x240(%[filter]), %%ymm13 \n\t" + "vmovups 0x260(%[filter]), %%ymm14 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm10, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm9 \n\t" + + "vbroadcastss 0x14(%[input0]), %%ymm12 \n\t" + "vmovups 0x280(%[filter]), %%ymm15 \n\t" + "vmovups 0x2A0(%[filter]), %%ymm10 \n\t" + "vmovups 0x2C0(%[filter]), %%ymm13 \n\t" + "vmovups 0x2E0(%[filter]), %%ymm14 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm10, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm9 \n\t" + + "vbroadcastss 0x18(%[input0]), %%ymm12 \n\t" + "vmovups 0x300(%[filter]), %%ymm15 \n\t" + "vmovups 0x320(%[filter]), %%ymm10 \n\t" + "vmovups 0x340(%[filter]), %%ymm13 \n\t" + "vmovups 0x360(%[filter]), %%ymm14 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm10, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm14, 
%%ymm12, %%ymm9 \n\t" + + "vbroadcastss 0x1C(%[input0]), %%ymm12 \n\t" + "vmovups 0x380(%[filter]), %%ymm15 \n\t" + "vmovups 0x3A0(%[filter]), %%ymm10 \n\t" + "vmovups 0x3C0(%[filter]), %%ymm13 \n\t" + "vmovups 0x3E0(%[filter]), %%ymm14 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm10, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm9 \n\t" + + "add $0x20, %[input0] \n\t" + "add $0x400, %[filter] \n\t" + "dec %%rcx \n\t" + "jg 1b \n\t" + + "vmovups %%ymm0, (%[output]) \n\t" + "vmovups %%ymm3, 0x20(%[output]) \n\t" + "vmovups %%ymm6, 0x40(%[output]) \n\t" + "vmovups %%ymm9, 0x60(%[output]) \n\t" + : + : [input0] "r"(c.input[0]), [filter] "r"(c.filter), [output] "r"(c.output), [ic] "c"(c.ic) + : "%rax", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", + "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory", "cc"); +} + +void winoKernel3x16(ConvController &c) +{ + __asm__ __volatile__( + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" + "vxorps %%ymm2, %%ymm2, %%ymm2 \n\t" + "vxorps %%ymm3, %%ymm3, %%ymm3 \n\t" + "vxorps %%ymm4, %%ymm4, %%ymm4 \n\t" + "vxorps %%ymm5, %%ymm5, %%ymm5 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "vbroadcastss (%[input0]), %%ymm12 \n\t" + "vbroadcastss (%[input1]), %%ymm13 \n\t" + "vbroadcastss (%[input2]), %%ymm14 \n\t" + "vmovups 0x0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovups 0x20(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + + "vbroadcastss 0x4(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x4(%[input1]), %%ymm13 \n\t" + "vbroadcastss 0x4(%[input2]), %%ymm14 \n\t" + "vmovups 0x40(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovups 0x60(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + + "vbroadcastss 0x8(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x8(%[input1]), %%ymm13 \n\t" + "vbroadcastss 0x8(%[input2]), %%ymm14 \n\t" + "vmovups 0x80(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovups 0xA0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + + "vbroadcastss 0xC(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0xC(%[input1]), %%ymm13 \n\t" + "vbroadcastss 0xC(%[input2]), %%ymm14 \n\t" + "vmovups 0xC0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovups 0xE0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + + "vbroadcastss 0x10(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x10(%[input1]), %%ymm13 \n\t" + "vbroadcastss 0x10(%[input2]), %%ymm14 \n\t" + "vmovups 0x100(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, 
%%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovups 0x120(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + + "vbroadcastss 0x14(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x14(%[input1]), %%ymm13 \n\t" + "vbroadcastss 0x14(%[input2]), %%ymm14 \n\t" + "vmovups 0x140(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovups 0x160(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + + "vbroadcastss 0x18(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x18(%[input1]), %%ymm13 \n\t" + "vbroadcastss 0x18(%[input2]), %%ymm14 \n\t" + "vmovups 0x180(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovups 0x1A0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + + "vbroadcastss 0x1C(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x1C(%[input1]), %%ymm13 \n\t" + "vbroadcastss 0x1C(%[input2]), %%ymm14 \n\t" + "vmovups 0x1C0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovups 0x1E0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + + "add $0x20, %[input0] \n\t" + "add $0x20, %[input1] \n\t" + "add $0x20, %[input2] \n\t" + "add $0x200, %[filter] \n\t" + "dec %%rcx \n\t" + "jg 1b \n\t" + + "vmovups %%ymm0, (%[output]) \n\t" + "vmovups %%ymm3, 0x20(%[output]) \n\t" + "vmovups %%ymm1, 0x40(%[output]) \n\t" + "vmovups %%ymm4, 0x60(%[output]) \n\t" + "vmovups %%ymm2, 0x80(%[output]) \n\t" + "vmovups %%ymm5, 0xA0(%[output]) \n\t" + : + : [input0] "r"(c.input[0]), [input1] "r"(c.input[1]), [input2] "r"(c.input[2]), + [filter] "r"(c.filter), [output] "r"(c.output), [ic] "c"(c.ic) + : "%rax", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", + "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory", "cc"); +} + +void winoKernel2x16(ConvController &c) +{ + __asm__ __volatile__( + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" + "vxorps %%ymm3, %%ymm3, %%ymm3 \n\t" + "vxorps %%ymm4, %%ymm4, %%ymm4 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "vbroadcastss (%[input0]), %%ymm12 \n\t" + "vbroadcastss (%[input1]), %%ymm13 \n\t" + "vmovups 0x0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vmovups 0x20(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + + "vbroadcastss 0x4(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x4(%[input1]), %%ymm13 \n\t" + "vmovups 0x40(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vmovups 0x60(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + + "vbroadcastss 0x8(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x8(%[input1]), 
%%ymm13 \n\t" + "vmovups 0x80(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vmovups 0xA0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + + "vbroadcastss 0xC(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0xC(%[input1]), %%ymm13 \n\t" + "vmovups 0xC0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vmovups 0xE0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + + "vbroadcastss 0x10(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x10(%[input1]), %%ymm13 \n\t" + "vmovups 0x100(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vmovups 0x120(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + + "vbroadcastss 0x14(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x14(%[input1]), %%ymm13 \n\t" + "vmovups 0x140(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vmovups 0x160(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + + "vbroadcastss 0x18(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x18(%[input1]), %%ymm13 \n\t" + "vmovups 0x180(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vmovups 0x1A0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + + "vbroadcastss 0x1C(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x1C(%[input1]), %%ymm13 \n\t" + "vmovups 0x1C0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vmovups 0x1E0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + + "add $0x20, %[input0] \n\t" + "add $0x20, %[input1] \n\t" + "add $0x200, %[filter] \n\t" + "dec %%rcx \n\t" + "jg 1b \n\t" + + "vmovups %%ymm0, (%[output]) \n\t" + "vmovups %%ymm3, 0x20(%[output]) \n\t" + "vmovups %%ymm1, 0x40(%[output]) \n\t" + "vmovups %%ymm4, 0x60(%[output]) \n\t" + : + : [input0] "r"(c.input[0]), [input1] "r"(c.input[1]), [input2] "r"(c.input[2]), + [filter] "r"(c.filter), [output] "r"(c.output), [ic] "c"(c.ic) + : "%rax", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", + "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory", "cc"); +} + +void winoKernel1x16(ConvController &c) +{ + __asm__ __volatile__( + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorps %%ymm3, %%ymm3, %%ymm3 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "vbroadcastss (%[input0]), %%ymm12 \n\t" + "vmovups 0x0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vmovups 0x20(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + + "vbroadcastss 0x4(%[input0]), %%ymm12 \n\t" + "vmovups 0x40(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vmovups 0x60(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + + "vbroadcastss 0x8(%[input0]), %%ymm12 \n\t" + "vmovups 0x80(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vmovups 0xA0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps 
%%ymm15, %%ymm12, %%ymm3 \n\t" + + "vbroadcastss 0xC(%[input0]), %%ymm12 \n\t" + "vmovups 0xC0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vmovups 0xE0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + + "vbroadcastss 0x10(%[input0]), %%ymm12 \n\t" + "vmovups 0x100(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vmovups 0x120(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + + "vbroadcastss 0x14(%[input0]), %%ymm12 \n\t" + "vmovups 0x140(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vmovups 0x160(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + + "vbroadcastss 0x18(%[input0]), %%ymm12 \n\t" + "vmovups 0x180(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vmovups 0x1A0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + + "vbroadcastss 0x1C(%[input0]), %%ymm12 \n\t" + "vmovups 0x1C0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vmovups 0x1E0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + + "add $0x20, %[input0] \n\t" + "add $0x200, %[filter] \n\t" + "dec %%rcx \n\t" + "jg 1b \n\t" + + "vmovups %%ymm0, (%[output]) \n\t" + "vmovups %%ymm3, 0x20(%[output]) \n\t" + : + : [input0] "r"(c.input[0]), [input1] "r"(c.input[1]), [input2] "r"(c.input[2]), + [filter] "r"(c.filter), [output] "r"(c.output), [ic] "c"(c.ic) + : "%rax", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", + "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory", "cc"); +} + +void winoKernel3x8(ConvController &c) +{ + __asm__ __volatile__( + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" + "vxorps %%ymm2, %%ymm2, %%ymm2 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "vbroadcastss (%[input0]), %%ymm12 \n\t" + "vbroadcastss (%[input1]), %%ymm13 \n\t" + "vbroadcastss (%[input2]), %%ymm14 \n\t" + "vmovups 0x0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + + "vbroadcastss 0x4(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x4(%[input1]), %%ymm13 \n\t" + "vbroadcastss 0x4(%[input2]), %%ymm14 \n\t" + "vmovups 0x20(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + + "vbroadcastss 0x8(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x8(%[input1]), %%ymm13 \n\t" + "vbroadcastss 0x8(%[input2]), %%ymm14 \n\t" + "vmovups 0x40(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + + "vbroadcastss 0xC(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0xC(%[input1]), %%ymm13 \n\t" + "vbroadcastss 0xC(%[input2]), %%ymm14 \n\t" + "vmovups 0x60(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + + "vbroadcastss 0x10(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x10(%[input1]), %%ymm13 \n\t" + "vbroadcastss 0x10(%[input2]), %%ymm14 \n\t" + "vmovups 0x80(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + + "vbroadcastss 0x14(%[input0]), %%ymm12 \n\t" + "vbroadcastss 
0x14(%[input1]), %%ymm13 \n\t" + "vbroadcastss 0x14(%[input2]), %%ymm14 \n\t" + "vmovups 0xA0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + + "vbroadcastss 0x18(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x18(%[input1]), %%ymm13 \n\t" + "vbroadcastss 0x18(%[input2]), %%ymm14 \n\t" + "vmovups 0xC0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + + "vbroadcastss 0x1C(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x1C(%[input1]), %%ymm13 \n\t" + "vbroadcastss 0x1C(%[input2]), %%ymm14 \n\t" + "vmovups 0xE0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + + "add $0x20, %[input0] \n\t" + "add $0x20, %[input1] \n\t" + "add $0x20, %[input2] \n\t" + "add $0x100, %[filter] \n\t" + "dec %%rcx \n\t" + "jg 1b \n\t" + + "vmovups %%ymm0, (%[output]) \n\t" + "vmovups %%ymm1, 0x20(%[output]) \n\t" + "vmovups %%ymm2, 0x40(%[output]) \n\t" + : + : [input0] "r"(c.input[0]), [input1] "r"(c.input[1]), [input2] "r"(c.input[2]), + [filter] "r"(c.filter), [output] "r"(c.output), [ic] "c"(c.ic) + : "%rax", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", + "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory", "cc"); +} + +void winoKernel2x8(ConvController &c) +{ + __asm__ __volatile__( + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "vbroadcastss (%[input0]), %%ymm12 \n\t" + "vbroadcastss (%[input1]), %%ymm13 \n\t" + "vmovups 0x0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + + "vbroadcastss 0x4(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x4(%[input1]), %%ymm13 \n\t" + "vmovups 0x20(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + + "vbroadcastss 0x8(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x8(%[input1]), %%ymm13 \n\t" + "vmovups 0x40(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + + "vbroadcastss 0xC(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0xC(%[input1]), %%ymm13 \n\t" + "vmovups 0x60(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + + "vbroadcastss 0x10(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x10(%[input1]), %%ymm13 \n\t" + "vmovups 0x80(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + + "vbroadcastss 0x14(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x14(%[input1]), %%ymm13 \n\t" + "vmovups 0xA0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + + "vbroadcastss 0x18(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x18(%[input1]), %%ymm13 \n\t" + "vmovups 0xC0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + + "vbroadcastss 0x1C(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x1C(%[input1]), %%ymm13 \n\t" + "vmovups 0xE0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + + "add $0x20, %[input0] \n\t" + 
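/* One unrolled pass over 8 input channels is complete: the first input tile
   pointer has just advanced by 8 floats (0x20 bytes); the second tile and the
   packed filter (8 channels x 8 output channels = 0x100 bytes) advance next,
   and rcx, preloaded from c.ic (= icSize / 8), counts the remaining channel
   groups for the backward branch to label 1. */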
"add $0x20, %[input1] \n\t" + "add $0x100, %[filter] \n\t" + "dec %%rcx \n\t" + "jg 1b \n\t" + + "vmovups %%ymm0, (%[output]) \n\t" + "vmovups %%ymm1, 0x20(%[output]) \n\t" + : + : [input0] "r"(c.input[0]), [input1] "r"(c.input[1]), [input2] "r"(c.input[2]), + [filter] "r"(c.filter), [output] "r"(c.output), [ic] "c"(c.ic) + : "%rax", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", + "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory", "cc"); +} + +void winoKernel1x8(ConvController &c) +{ + __asm__ __volatile__( + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "vbroadcastss (%[input0]), %%ymm12 \n\t" + "vmovups 0x0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + + "vbroadcastss 0x4(%[input0]), %%ymm12 \n\t" + "vmovups 0x20(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + + "vbroadcastss 0x8(%[input0]), %%ymm12 \n\t" + "vmovups 0x40(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + + "vbroadcastss 0xC(%[input0]), %%ymm12 \n\t" + "vmovups 0x60(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + + "vbroadcastss 0x10(%[input0]), %%ymm12 \n\t" + "vmovups 0x80(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + + "vbroadcastss 0x14(%[input0]), %%ymm12 \n\t" + "vmovups 0xA0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + + "vbroadcastss 0x18(%[input0]), %%ymm12 \n\t" + "vmovups 0xC0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + + "vbroadcastss 0x1C(%[input0]), %%ymm12 \n\t" + "vmovups 0xE0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + + "add $0x20, %[input0] \n\t" + "add $0x100, %[filter] \n\t" + "dec %%rcx \n\t" + "jg 1b \n\t" + + "vmovups %%ymm0, (%[output]) \n\t" + : + : [input0] "r"(c.input[0]), [input1] "r"(c.input[1]), [input2] "r"(c.input[2]), + [filter] "r"(c.filter), [output] "r"(c.output), [ic] "c"(c.ic) + : "%rax", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", + "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory", "cc"); +} + +EE convolution_winograd(TensorDesc inputDesc, + F32 *inArray, + F32 *eltwiseInput, + TensorDesc filterDesc, + const F32 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const F32 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F32 *outArray, + ActivationParamSpec activationDesc) +{ + UNUSED(biasDesc); + UNUSED(tmpBytes); + + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + + if ((fdf != DF_NCHWCxN32 && fdf != DF_NCHWCxN24) || (idf != DF_NCHWC8) || (ic % 8 != 0)) { + CHECK_STATUS(NOT_MATCH); + } + + if (activationDesc.mode != ACTIVATION_RELU && activationDesc.mode != ACTIVATION_NULL) { + CHECK_STATUS(NOT_SUPPORTED); + } + + // get kernels + const kernelFunc wino[3][3] = { + {winoKernel1x8, winoKernel2x8, winoKernel3x8}, + {winoKernel1x16, winoKernel2x16, winoKernel3x16}, + {winoKernel1x32, winoKernel2x32, winoKernel3x32}, + }; + + // get computing params + I32 strideH = convParamSpec.stride_h; + I32 strideW = convParamSpec.stride_w; + I32 paddingT = convParamSpec.pad_top; + I32 paddingB = 
convParamSpec.pad_bottom; + I32 paddingL = convParamSpec.pad_left; + I32 paddingR = convParamSpec.pad_right; + I32 dilateH = convParamSpec.dilatedRate_h; + I32 dilateW = convParamSpec.dilatedRate_w; + I32 ih_pad = ih + paddingT + paddingB; + I32 iw_pad = iw + paddingL + paddingR; + I32 ohow = oh * ow; + + I32 oPaddingR = (ow % 4 == 0) ? 0 : (4 - ow % 4); + I32 oPaddingB = (oh % 4 == 0) ? 0 : (4 - oh % 4); + I32 oh_pad = oh + oPaddingB; + I32 ow_pad = ow + oPaddingR; + paddingR += oPaddingR; + paddingB += oPaddingB; + + // infer block params + I32 ocBlockSizes[] = {8, 16, 32}; + I32 wSizes[] = {1, 2, 3}; + + // infer kernel params + ConvController convCtl; + convCtl.eltwise = nullptr; + F32 *iaddr[3]; + convCtl.input = iaddr; + bool noPadI = (paddingT == 0 && paddingB == 0 && paddingL == 0 && paddingR == 0); + bool noPadO = (oPaddingB == 0 && oPaddingR == 0); + for (U32 n = 0; n < in; ++n) { + F32 *bInArray = inArray + n * ic * ih * iw; + F32 *bOutArray = outArray + n * oc * oh * ow; + + I32 icSize = 0; + bool addF = false; + ActivationMode mode = ACTIVATION_NULL; + for (I32 icb = 0; icb < (int)ic; icb += icSize) { + icSize = UNI_MIN(BLOCK_IC_DIM, (int)ic - icb); + addF = (icb > 0); + if (icb == (int)ic - icSize) { + mode = activationDesc.mode; + } + + for (I32 h = 0; h < oh_pad; h += 4) { + I32 ocSize = 0; + for (U32 ocb = 0; ocb < oc; ocb += ocSize) { + ocSize = UNI_MIN(32, (int)oc - ocb); + ocSize = ocBlockSizes[ocSize >> 4]; + const F32 *bias = biasArray + ocb; + I32 wSize = 0; + for (I32 w = 0; w < ow_pad; w += 4 * wSize) { + wSize = UNI_MIN((int)ow_pad - w, 12); + wSize = wSize >> 2; + I32 in_w = w * strideW; + I32 in_h = h * strideH; + F32 *curI; + F32 *curO = bOutArray + ocb * oh * ow + (h * ow + w) * 8; + F32 *tmpI = (F32 *)tmp + 36 * icSize * w / 4; + F32 *buff = (F32 *)tmp + 36 * icSize * (ow_pad / 4 + 1); + F32 *tmpO = (F32 *)buff + 36 * 36 * wSize; + if (ocb == 0) { + if (noPadI) { + curI = bInArray + icb * ih * iw + (in_h * iw + in_w) * 8; + transformInput4x4_3x3( + curI, tmpI, buff, iw, ih, icSize, wSize, icSize); + } else { + in_w = (in_w > paddingL) ? (in_w - paddingL) : 0; + in_h = (in_h > paddingT) ? 
(in_h - paddingT) : 0; + curI = bInArray + icb * ih * iw + (in_h * iw + in_w) * 8; + transformInputWithPad4x4_3x3(curI, tmpI, buff, iw, ih, icSize, + wSize, icSize, paddingL, paddingR, paddingT, paddingB, h, w, + oh_pad, ow_pad); + } + } + + for (I32 i = 0; i < 36; ++i) { + convCtl.ic = icSize / 8; + convCtl.input[0] = tmpI + i * icSize; + convCtl.input[1] = tmpI + icSize * 36 * 1 + i * icSize; + convCtl.input[2] = tmpI + icSize * 36 * 2 + i * icSize; + convCtl.output = tmpO + i * ocSize * wSize; + convCtl.filter = filterArray + icb * fn * 36 + ocb * icSize * 36 + + i * ocSize * icSize; + wino[ocSize >> 4][wSize - 1](convCtl); + } + if (noPadO) { + transformOutput4x4_3x3( + tmpO, curO, buff, bias, ow, oh, ocSize, wSize, addF, mode); + } else { + transformOutputWithPad4x4_3x3(tmpO, curO, buff, bias, ow, oh, ocSize, + wSize, addF, oPaddingR, oPaddingB, h, w, mode); + } + } + } + } + } + } + + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/x86/fp32/deconvolution_transform.cpp b/compute/tensor/src/cpu/x86/fp32/deconvolution_transform.cpp index f6e4fd28..eab965ff 100644 --- a/compute/tensor/src/cpu/x86/fp32/deconvolution_transform.cpp +++ b/compute/tensor/src/cpu/x86/fp32/deconvolution_transform.cpp @@ -17,7 +17,7 @@ template inline void transformCNHW2NCHWCxNxKernel( U32 fc, U32 fn, U32 fh, U32 fw, U32 fnPadding, const F32 *input, F32 *output) { - F32 *dest; + F32 *dest = nullptr; const F32 *src; U32 cSize = 0, cSizePadding = 0; U32 lstep = fh * fw; @@ -44,7 +44,7 @@ inline void transformCNHW2NCHWCxNxKernel( _mm256_storeu_ps(dest + 24, _mm256_i32gather_ps(src + 24 * lstep, vindex, 4)); } } - memset(dest + N, 0, ((cSizePadding - cSize) * N * 4)); + UNI_MEMSET(dest + N, 0, ((cSizePadding - cSize) * N * 4)); } } } @@ -85,7 +85,7 @@ inline EE transformCNHW2NCHWCxNx( tail -= 8; } if (tail > 0) { - F32 *dest; + F32 *dest = nullptr; const F32 *src; U32 cSize = 0, cSizePadding = 0; U32 hwMax = fh * fw - 1; @@ -108,7 +108,7 @@ inline EE transformCNHW2NCHWCxNx( dest = output + n * fh * fw * 8 + hw * cSizePadding * 8 + c8 * 8; _mm256_storeu_ps(dest, _mm256_mask_i32gather_ps(src256, src, vindex, mask, 4)); } - memset(dest + 8, 0, ((cSizePadding - cSize) * 32)); + UNI_MEMSET(dest + 8, 0, ((cSizePadding - cSize) * 32)); } } } @@ -169,7 +169,7 @@ inline EE deconvolution_transform_filter_kernel_fp32(TensorDesc filterDesc, CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); if (fdf == ftmDataFormat) { *ftmDesc = filterDesc; - memcpy(ftmArray, filterArray, fn * fc * fh * fw * bytesOf(fdt)); + UNI_MEMCPY(ftmArray, filterArray, fn * fc * fh * fw * bytesOf(fdt)); return SUCCESS; } if (fdf != DF_NCHW) { @@ -180,7 +180,7 @@ inline EE deconvolution_transform_filter_kernel_fp32(TensorDesc filterDesc, case DF_NCHWC24: { filterDesc = tensor4df(fdt, fdf, 1, fc, fh, fw); *ftmDesc = tensor4df(fdt, ftmDataFormat, 1, fc, fh, fw); - transformCNHW2NCHWCxNx<1, 24>(filterDesc, filterArray, *ftmDesc, ftmArray); + transformCNHW2NCHWCxNx<1, 16>(filterDesc, filterArray, *ftmDesc, ftmArray); *ftmDesc = tensor4df(fdt, ftmDataFormat, fn, fc, fh, fw); break; } diff --git a/compute/tensor/src/cpu/x86/fp32/depthwise_convolution_direct.cpp b/compute/tensor/src/cpu/x86/fp32/depthwise_convolution_direct.cpp index 8d841efd..d0e54d09 100644 --- a/compute/tensor/src/cpu/x86/fp32/depthwise_convolution_direct.cpp +++ b/compute/tensor/src/cpu/x86/fp32/depthwise_convolution_direct.cpp @@ -20,7 +20,7 @@ #include "cpu/x86/fp32/convolution_functions.h" #define UNROLL_W 4 -#define UNROLL_OC_BLOCK_DIM 24 +#define 
UNROLL_OC_BLOCK_DIM 16 typedef void (*kernelFunc)(F32 *in0, F32 *in1, @@ -334,6 +334,117 @@ void Avx2DwKernel4x16(F32 *in0, "%ymm15", "memory", "cc"); } +void Avx512DwKernel4x16(F32 *in0, + F32 *in1, + F32 *in2, + F32 *in3, + const F32 *curW, + F32 *curO, + const F32 *curB, + I32 fw, + I32 fh, + I32 oStep, + I32 iStep, + I32 hStep, + I32 flags, + I32 dw, + I32 wStep) +{ + __asm__ __volatile__("vmovups (%5), %%zmm0 \n\t" + "vmovups %%zmm0, %%zmm1 \n\t" + "vmovups %%zmm0, %%zmm2 \n\t" + "vmovups %%zmm0, %%zmm3 \n\t" + + "cmp $0, %%ecx \n\t" + "je 3f \n\t" + "cmp $0, %6 \n\t" + "je 3f \n\t" + + ".align 16 \n\t" + "0: \n\t" + + "mov %6, %%eax \n\t" + ".align 16 \n\t" + "1: \n\t" + + "vmovaps (%4), %%zmm11 \n\t" + "vmovups (%0), %%zmm12 \n\t" + "vmovups (%1), %%zmm13 \n\t" + "vmovups (%2), %%zmm14 \n\t" + "vmovups (%3), %%zmm15 \n\t" + "vfmadd231ps %%zmm12, %%zmm11, %%zmm0 \n\t" + "vfmadd231ps %%zmm13, %%zmm11, %%zmm1 \n\t" + "prefetcht0 0x40(%4) \n\t" + "vfmadd231ps %%zmm14, %%zmm11, %%zmm2 \n\t" + "vfmadd231ps %%zmm15, %%zmm11, %%zmm3 \n\t" + + "add %12, %0 \n\t" + "add %12, %1 \n\t" + "add %12, %2 \n\t" + "add %12, %3 \n\t" + "add $0x40, %4 \n\t" + "dec %%eax \n\t" + "jg 1b \n\t" + + "add %10, %4 \n\t" + "add %9, %0 \n\t" + "add %9, %1 \n\t" + "add %9, %2 \n\t" + "add %9, %3 \n\t" + "dec %%ecx \n\t" + "jg 0b \n\t" + + // relu + "mov %11, %%eax \n\t" + "and $0x6, %%eax \n\t" + "je 3f \n\t" + "vxorps %%zmm15, %%zmm15, %%zmm15 \n\t" + "vmaxps %%zmm15, %%zmm0, %%zmm0 \n\t" + "vmaxps %%zmm15, %%zmm1, %%zmm1 \n\t" + "vmaxps %%zmm15, %%zmm2, %%zmm2 \n\t" + "vmaxps %%zmm15, %%zmm3, %%zmm3 \n\t" + "vmaxps %%zmm15, %%zmm4, %%zmm4 \n\t" + "vmaxps %%zmm15, %%zmm5, %%zmm5 \n\t" + "vmaxps %%zmm15, %%zmm6, %%zmm6 \n\t" + "vmaxps %%zmm15, %%zmm7, %%zmm7 \n\t" + + // relu6 + "and $0x4, %%eax \n\t" + "je 3f \n\t" + "mov $0x40C00000, %%eax \n\t" + "vmovd %%eax, %%xmm12 \n\t" + "vpermps %%zmm12, %%zmm15, %%zmm12 \n\t" + "vminps %%zmm12, %%zmm0, %%zmm0 \n\t" + "vminps %%zmm12, %%zmm1, %%zmm1 \n\t" + "vminps %%zmm12, %%zmm2, %%zmm2 \n\t" + "vminps %%zmm12, %%zmm3, %%zmm3 \n\t" + "vminps %%zmm12, %%zmm4, %%zmm4 \n\t" + "vminps %%zmm12, %%zmm5, %%zmm5 \n\t" + "vminps %%zmm12, %%zmm6, %%zmm6 \n\t" + "vminps %%zmm12, %%zmm7, %%zmm7 \n\t" + + ".align 16 \n\t" + "3: \n\t" + : + : "r"(in0), "r"(in1), "r"(in2), "r"(in3), "r"(curW), "r"(curB), "r"(fw), + "c"(fh), "r"((I64)iStep), "r"((I64)hStep), "r"((I64)wStep), "r"(flags), + "r"((I64)dw) + : "%eax", "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", + "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", + "%zmm14", "%zmm15", "memory", "cc"); + + __asm__ __volatile__("vmovups %%zmm0, (%0) \n\t" + "vmovups %%zmm1, 0x40(%0) \n\t" + "vmovups %%zmm2, 0x80(%0) \n\t" + "vmovups %%zmm3, 0xC0(%0) \n\t" + + ".align 16 \n\t" + "1: \n\t" + : + : "r"(curO), "r"((I64)oStep) + : "%zmm0", "%zmm1", "%zmm2", "%zmm3", "memory", "cc"); +} + void Avx2DwKernel4x8(F32 *in0, F32 *in1, F32 *in2, @@ -593,6 +704,75 @@ void Avx2DwKernel1x16(F32 *in0, "%ymm14", "%ymm15", "memory", "cc"); } +void Avx512DwKernel1x16(F32 *in0, + F32 *in1, + F32 *in2, + F32 *in3, + const F32 *curW, + F32 *curO, + const F32 *curB, + I32 fw, + I32 fh, + I32 oStep, + I32 iStep, + I32 hStep, + I32 flags, + I32 dw, + I32 wStep) +{ + __asm__ __volatile__("vmovups (%3), %%zmm0 \n\t" + + "cmp $0, %%ecx \n\t" + "je 3f \n\t" + "cmp $0, %4 \n\t" + "je 3f \n\t" + + ".align 16 \n\t" + "0: \n\t" + + "mov %4, %%eax \n\t" + ".align 16 \n\t" + "1: \n\t" + + "vmovaps (%1), %%zmm1 \n\t" + "vmovups (%0), 
%%zmm2 \n\t" + "vfmadd231ps %%zmm2, %%zmm1, %%zmm0 \n\t" + + "add %11, %0 \n\t" + "add $0x40, %1 \n\t" + "dec %%eax \n\t" + "jg 1b \n\t" + + "add %6, %1 \n\t" + "add %9, %0 \n\t" + "dec %%ecx \n\t" + "jg 0b \n\t" + + // relu + "mov %10, %%eax \n\t" + "and $0x6, %%eax \n\t" + "je 3f \n\t" + "vxorps %%zmm3, %%zmm3, %%zmm3 \n\t" + "vmaxps %%zmm3, %%zmm0, %%zmm0 \n\t" + + // relu6 + "and $0x4, %%eax \n\t" + "je 3f \n\t" + "mov $0x40C00000, %%eax \n\t" + "vmovd %%eax, %%xmm3 \n\t" + "vbroadcastss %%xmm3, %%zmm4 \n\t" + "vminps %%zmm4, %%zmm0, %%zmm0 \n\t" + + ".align 16 \n\t" + "3: \n\t" + "vmovups %%zmm0, (%2) \n\t" + : + : "r"(in0), "r"(curW), "r"(curO), "r"(curB), "r"(fw), "c"(fh), + "r"((I64)wStep), "r"((I64)oStep), "r"((I64)iStep), "r"((I64)hStep), + "r"(flags), "r"((I64)dw) + : "%eax", "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "memory", "cc"); +} + void Avx2DwKernel1x8(F32 *in0, F32 *in1, F32 *in2, @@ -836,6 +1016,103 @@ inline void Avx2DwKernel33s14x24(F32 *in0, "%ymm15", "memory", "cc"); } +inline void Avx512DwKernel33s14x16(F32 *in0, + const F32 *curW, + F32 *curO, + const F32 *curB, + I32 oStep, + I32 iStep, + I32 hStep, + I32 flags, + I32 fh) +{ + __asm__ __volatile__( + "vmovups (%2), %%zmm0 \n\t" + "vmovups %%zmm0, %%zmm1 \n\t" + "vmovups %%zmm0, %%zmm2 \n\t" + "vmovups %%zmm0, %%zmm3 \n\t" + + ".align 16 \n\t" + "0: " + + "vmovaps (%1), %%zmm15 \n\t" + "vmovups (%0), %%zmm8 \n\t" + "vmovups 0x40(%0), %%zmm9 \n\t" + "vmovups 0x80(%0), %%zmm10 \n\t" + "vmovups 0xC0(%0), %%zmm11 \n\t" + "vfmadd231ps %%zmm8, %%zmm15, %%zmm0 \n\t" + "vfmadd231ps %%zmm9, %%zmm15, %%zmm1 \n\t" + "vfmadd231ps %%zmm10, %%zmm15, %%zmm2 \n\t" + "vfmadd231ps %%zmm11, %%zmm15, %%zmm3 \n\t" + + "vmovaps 0x40(%1), %%zmm15 \n\t" + "vmovups 0x100(%0), %%zmm8 \n\t" + "vfmadd231ps %%zmm9, %%zmm15, %%zmm0 \n\t" + "vfmadd231ps %%zmm10, %%zmm15, %%zmm1 \n\t" + "vfmadd231ps %%zmm11, %%zmm15, %%zmm2 \n\t" + "vfmadd231ps %%zmm8, %%zmm15, %%zmm3 \n\t" + + "vmovaps 0x80(%1), %%zmm15 \n\t" + "vmovups 0x140(%0), %%zmm12 \n\t" + "vfmadd231ps %%zmm10, %%zmm15, %%zmm0 \n\t" + "vfmadd231ps %%zmm11, %%zmm15, %%zmm1 \n\t" + "vfmadd231ps %%zmm8, %%zmm15, %%zmm2 \n\t" + "vfmadd231ps %%zmm12, %%zmm15, %%zmm3 \n\t" + + "add %4, %0 \n\t" + "add $0xC0, %1 \n\t" + + "dec %%ecx \n\t" + "jg 0b \n\t" + + // relu + "mov %5, %%eax \n\t" + "and $0x6, %%eax \n\t" + "je 1f \n\t" + "vxorps %%zmm15, %%zmm15, %%zmm15 \n\t" + "vmaxps %%zmm15, %%zmm0, %%zmm0 \n\t" + "vmaxps %%zmm15, %%zmm1, %%zmm1 \n\t" + "vmaxps %%zmm15, %%zmm2, %%zmm2 \n\t" + "vmaxps %%zmm15, %%zmm3, %%zmm3 \n\t" + "vmaxps %%zmm15, %%zmm4, %%zmm4 \n\t" + "vmaxps %%zmm15, %%zmm5, %%zmm5 \n\t" + "vmaxps %%zmm15, %%zmm6, %%zmm6 \n\t" + "vmaxps %%zmm15, %%zmm7, %%zmm7 \n\t" + + // relu6 + "and $0x4, %%eax \n\t" + "je 1f \n\t" + "mov $0x40C00000, %%eax \n\t" + "vmovd %%eax, %%xmm12 \n\t" + "vpermps %%zmm12, %%zmm15, %%zmm12 \n\t" + "vminps %%zmm12, %%zmm0, %%zmm0 \n\t" + "vminps %%zmm12, %%zmm1, %%zmm1 \n\t" + "vminps %%zmm12, %%zmm2, %%zmm2 \n\t" + "vminps %%zmm12, %%zmm3, %%zmm3 \n\t" + "vminps %%zmm12, %%zmm4, %%zmm4 \n\t" + "vminps %%zmm12, %%zmm5, %%zmm5 \n\t" + "vminps %%zmm12, %%zmm6, %%zmm6 \n\t" + "vminps %%zmm12, %%zmm7, %%zmm7 \n\t" + + ".align 16 \n\t" + "1: \n\t" + : + : "r"(in0), "r"(curW), "r"(curB), "r"((I64)iStep), "r"((I64)hStep), "r"(flags), "c"(fh) + : "%eax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", + "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", "%zmm15", "memory", + "cc"); + + __asm__ __volatile__("vmovups 
%%zmm0, (%0) \n\t" + "vmovups %%zmm1, 0x40(%0) \n\t" + "vmovups %%zmm2, 0x80(%0) \n\t" + "vmovups %%zmm3, 0xC0(%0) \n\t" + : + : "r"(curO), "r"((I64)oStep) + : "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", + "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", + "%zmm15", "memory", "cc"); +} + inline void Avx2DwKernel33s14x16(F32 *in0, const F32 *curW, F32 *curO, @@ -1242,25 +1519,28 @@ EE depthwise_convolution_direct(TensorDesc inputDesc, CHECK_STATUS(tensor4dGetI32(dwFilterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); CHECK_STATUS(tensor4dGetI32(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - if ((fdf != DF_NCHWC24 && fdf != DF_NCHWC8) || (idf != DF_NCHWC8) || (ic % 8 != 0)) { + if ((fdf != DF_NCHWC24 && fdf != DF_NCHWC8) || (idf != DF_NCHWC8 && idf != DF_NCHWC16) || (ic % 8 != 0)) { CHECK_STATUS(NOT_MATCH); } // get kernels - kernelFunc kernel[2][3] = {{Avx2DwKernel1x8, Avx2DwKernel1x16, Avx2DwKernel1x24}, - {Avx2DwKernel4x8, Avx2DwKernel4x16, Avx2DwKernel4x24}}; + kernelFunc kernel[3][2] = {{Avx2DwKernel1x8, Avx2DwKernel4x8}, + {Avx2DwKernel1x16, Avx2DwKernel4x16}, + {Avx2DwKernel1x24, Avx2DwKernel4x24}}; kernel33Func kernel33[2][3] = {{Avx2DwKernel33s18x8, Avx2DwKernel33s14x16, Avx2DwKernel33s14x24}, {Avx2DwKernel33s28x8, nullptr, nullptr}}; + kernelFunc kernel512[2] = {Avx512DwKernel1x16, Avx512DwKernel4x16}; + kernel33Func kernel51233[1] = {Avx512DwKernel33s14x16}; I32 unrollOcArray[3] = {8, 16, 24}; I32 unrollHw33s1Array[3] = {8, 4, 4}; // get computing params I32 strideH = convParamSpec.stride_h; I32 strideW = convParamSpec.stride_w; - I32 paddingT = convParamSpec.padding_top; - I32 paddingB = convParamSpec.padding_bottom; - I32 paddingL = convParamSpec.padding_left; - I32 paddingR = convParamSpec.padding_right; + I32 paddingT = convParamSpec.pad_top; + I32 paddingB = convParamSpec.pad_bottom; + I32 paddingL = convParamSpec.pad_left; + I32 paddingR = convParamSpec.pad_right; I32 dilateH = convParamSpec.dilatedRate_h; I32 dilateW = convParamSpec.dilatedRate_w; I32 fhDilated = (fh - 1) * dilateH + 1; @@ -1270,14 +1550,15 @@ EE depthwise_convolution_direct(TensorDesc inputDesc, // infer block params I32 unrollOc = UNROLL_OC_BLOCK_DIM; I32 unrollHw = UNROLL_W; + I32 cLen = (idf == DF_NCHWC16)? 
16: 8; // infer kernel params - I32 oStep = oh * ow * SIMDW * BYTES; - I32 iStep = ih * iw * SIMDW * BYTES; - I32 hStep = (iw - fw * dilateW + (dilateH - 1) * iw) * SIMDW * BYTES; - I32 hStep33 = iw * SIMDW * BYTES; - I32 sw = strideW * SIMDW * BYTES; - I32 dw = dilateW * SIMDW * BYTES; + I32 oStep = oh * ow * cLen * BYTES; + I32 iStep = ih * iw * cLen * BYTES; + I32 hStep = (iw - fw * dilateW + (dilateH - 1) * iw) * cLen * BYTES; + I32 hStep33 = iw * cLen * BYTES; + I32 sw = strideW * cLen * BYTES; + I32 dw = dilateW * cLen * BYTES; // fuse dw+pw F32 *useOutArray = (F32 *)tmp; @@ -1300,6 +1581,13 @@ EE depthwise_convolution_direct(TensorDesc inputDesc, ocSize = UNI_MIN(unrollOc, ic - ocb); I32 ocIdx = (ocSize >> 3) - 1; ocSize = unrollOcArray[ocIdx]; + kernelFunc *wkernel = kernel[ocIdx]; + kernel33Func wkernel33 = kernel33[0][ocIdx]; + if (idf == DF_NCHWC16) { + ocSize = 16; + wkernel = kernel512; + wkernel33 = kernel51233[0]; + } if (paddingT == 0 && paddingB == 0 && paddingL == 0 && paddingR == 0) { I32 wSize = 0; if (use3x3) { @@ -1309,15 +1597,15 @@ EE depthwise_convolution_direct(TensorDesc inputDesc, wSize = UNI_MIN(ow - w, unrollHw); I32 in_h_0 = h * strideH; I32 in_w_0 = w * strideW; - F32 *in_0 = curI + in_h_0 * iw * SIMDW + in_w_0 * SIMDW; - F32 *calO = curO + (h * ow + w) * SIMDW; + F32 *in_0 = curI + in_h_0 * iw * cLen + in_w_0 * cLen; + F32 *calO = curO + (h * ow + w) * cLen; if (wSize < unrollHw) { - kernel[0][ocIdx](in_0, nullptr, nullptr, nullptr, curW, calO, curB, + wkernel[0](in_0, nullptr, nullptr, nullptr, curW, calO, curB, fw, fh, oStep, iStep, hStep, flags, dw, 0); wSize = 1; } else { - kernel33[strideW - 1][ocIdx]( + wkernel33( in_0, curW, calO, curB, oStep, iStep, hStep33, flags, 3); } } @@ -1336,12 +1624,12 @@ EE depthwise_convolution_direct(TensorDesc inputDesc, I32 in_w_2 = (hw + 2) % ow * strideW; I32 in_h_3 = (hw + 3) / ow * strideH; I32 in_w_3 = (hw + 3) % ow * strideW; - F32 *in_0 = curI + in_h_0 * iw * SIMDW + in_w_0 * SIMDW; - F32 *in_1 = curI + in_h_1 * iw * SIMDW + in_w_1 * SIMDW; - F32 *in_2 = curI + in_h_2 * iw * SIMDW + in_w_2 * SIMDW; - F32 *in_3 = curI + in_h_3 * iw * SIMDW + in_w_3 * SIMDW; + F32 *in_0 = curI + in_h_0 * iw * cLen + in_w_0 * cLen; + F32 *in_1 = curI + in_h_1 * iw * cLen + in_w_1 * cLen; + F32 *in_2 = curI + in_h_2 * iw * cLen + in_w_2 * cLen; + F32 *in_3 = curI + in_h_3 * iw * cLen + in_w_3 * cLen; - kernel[wSize >> 2][ocIdx](in_0, in_1, in_2, in_3, curW, curO + hw * SIMDW, + wkernel[wSize >> 2](in_0, in_1, in_2, in_3, curW, curO + hw * cLen, curB, fw, fh, oStep, iStep, hStep, flags, dw, 0); } } @@ -1369,28 +1657,28 @@ EE depthwise_convolution_direct(TensorDesc inputDesc, inW = (inW >= 0) ? 
inW : iwJump; tfw = GetKernelnoDilated(tfw, dilateW); const F32 *useW = calW + wwJump * ocSize; - F32 *in_0 = curI + inH * iw * SIMDW + inW * SIMDW; - F32 *calO = curO + (h * ow + realW) * SIMDW; - hStep = (iw - tfw * dilateW + (dilateH - 1) * iw) * SIMDW * BYTES; - kernel[0][ocIdx](in_0, nullptr, nullptr, nullptr, useW, calO, curB, tfw, + F32 *in_0 = curI + inH * iw * cLen + inW * cLen; + F32 *calO = curO + (h * ow + realW) * cLen; + hStep = (iw - tfw * dilateW + (dilateH - 1) * iw) * cLen * BYTES; + wkernel[0](in_0, nullptr, nullptr, nullptr, useW, calO, curB, tfw, tfh, oStep, iStep, hStep, flags, dw, (fw - tfw) * ocSize * BYTES); } w = owPaddingL; I32 wSize = 0; - hStep = (iw - fw * dilateW + (dilateH - 1) * iw) * SIMDW * BYTES; + hStep = (iw - fw * dilateW + (dilateH - 1) * iw) * cLen * BYTES; if (use3x3) { unrollHw = unrollHw33s1Array[ocIdx]; for (; w < ow - owPaddingR; w += wSize) { wSize = UNI_MIN(ow - owPaddingR - w, unrollHw); I32 in_w_0 = w * strideW - paddingL; - F32 *in_0 = curI + inH * iw * SIMDW + in_w_0 * SIMDW; - F32 *calO = curO + (h * ow + w) * SIMDW; + F32 *in_0 = curI + inH * iw * cLen + in_w_0 * cLen; + F32 *calO = curO + (h * ow + w) * cLen; if (wSize < unrollHw) { - kernel[0][ocIdx](in_0, nullptr, nullptr, nullptr, calW, calO, curB, + wkernel[0](in_0, nullptr, nullptr, nullptr, calW, calO, curB, fw, tfh, oStep, iStep, hStep, flags, dw, 0); wSize = 1; } else { - kernel33[strideW - 1][ocIdx]( + wkernel33( in_0, calW, calO, curB, oStep, iStep, hStep33, flags, tfh); } } @@ -1404,13 +1692,13 @@ EE depthwise_convolution_direct(TensorDesc inputDesc, I32 in_w_1 = (w + 1) * strideW - paddingL; I32 in_w_2 = (w + 2) * strideW - paddingL; I32 in_w_3 = (w + 3) * strideW - paddingL; - F32 *in_0 = curI + inH * iw * SIMDW + in_w_0 * SIMDW; - F32 *in_1 = curI + inH * iw * SIMDW + in_w_1 * SIMDW; - F32 *in_2 = curI + inH * iw * SIMDW + in_w_2 * SIMDW; - F32 *in_3 = curI + inH * iw * SIMDW + in_w_3 * SIMDW; - F32 *calO = curO + (h * ow + w) * SIMDW; + F32 *in_0 = curI + inH * iw * cLen + in_w_0 * cLen; + F32 *in_1 = curI + inH * iw * cLen + in_w_1 * cLen; + F32 *in_2 = curI + inH * iw * cLen + in_w_2 * cLen; + F32 *in_3 = curI + inH * iw * cLen + in_w_3 * cLen; + F32 *calO = curO + (h * ow + w) * cLen; - kernel[wSize >> 2][ocIdx](in_0, in_1, in_2, in_3, calW, calO, curB, fw, + wkernel[wSize >> 2](in_0, in_1, in_2, in_3, calW, calO, curB, fw, tfh, oStep, iStep, hStep, flags, dw, 0); } } @@ -1424,7 +1712,7 @@ EE depthwise_convolution_direct(TensorDesc inputDesc, tmpBytes -= oh * ic * oh * ow + 32; tmp = (void *)((F32 *)tmp + oh * ic * oh * ow + 32); ConvolutionParamSpec p = createConvolutionParamSpec( - 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, fn, Convolution_Pointwise); + 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, fn, CONVOLUTION_POINTWISE); convolution_1x1_direct(pwInputDesc, useOutArray, eltwiseInput, pwFilterDesc, pwFilterArray, p, pwBiasArray, tmpBytes, tmp, outputDesc, outArray, pointwiseActivationParamSpec); } diff --git a/compute/tensor/src/cpu/x86/fp32/depthwise_convolution_transform.cpp b/compute/tensor/src/cpu/x86/fp32/depthwise_convolution_transform.cpp index fd6b15ca..ae32441c 100644 --- a/compute/tensor/src/cpu/x86/fp32/depthwise_convolution_transform.cpp +++ b/compute/tensor/src/cpu/x86/fp32/depthwise_convolution_transform.cpp @@ -29,7 +29,7 @@ inline EE depthwise_convolution_transform_filter_kernel_fp32(TensorDesc filterDe CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); if (fdf == ftmDataFormat) { *ftmDesc = filterDesc; - 
memcpy(ftmArray, filterArray, fn * fc * fh * fw * bytesOf(fdt)); + UNI_MEMCPY(ftmArray, filterArray, fn * fc * fh * fw * bytesOf(fdt)); return SUCCESS; } if (fdf != DF_NCHW) { @@ -39,7 +39,7 @@ inline EE depthwise_convolution_transform_filter_kernel_fp32(TensorDesc filterDe *ftmDesc = tensor4df(fdt, ftmDataFormat, fc, 1, fh, fw); switch (ftmDataFormat) { case DF_NCHWC24: { - transformNCHWToNCHWCxNx<1, 24>(filterDesc, filterArray, *ftmDesc, ftmArray); + transformNCHWToNCHWCxNx<1, 16>(filterDesc, filterArray, *ftmDesc, ftmArray); break; } case DF_NCHWC8: { diff --git a/compute/tensor/src/cpu/x86/fp32/depthwise_pointwise_convolution.cpp b/compute/tensor/src/cpu/x86/fp32/depthwise_pointwise_convolution.cpp index b431d50c..5118606f 100644 --- a/compute/tensor/src/cpu/x86/fp32/depthwise_pointwise_convolution.cpp +++ b/compute/tensor/src/cpu/x86/fp32/depthwise_pointwise_convolution.cpp @@ -51,7 +51,7 @@ EE depthwise_pointwise_convolution_fp32(TensorDesc inputDesc, if (!(idt == DT_F32 && fdt == DT_F32 && odt == DT_F32)) { CHECK_STATUS(NOT_MATCH); } - if (!(idf == DF_NCHWC8 && odf == DF_NCHWC8)) { + if (!(idf == DF_NCHWC8 || idf == DF_NCHWC16)) { CHECK_STATUS(NOT_MATCH); } if (ic != fc) { diff --git a/compute/tensor/src/cpu/x86/fp32/depthwise_pointwise_convolution_transform.cpp b/compute/tensor/src/cpu/x86/fp32/depthwise_pointwise_convolution_transform.cpp index d6ffa6ca..2aae9f3f 100644 --- a/compute/tensor/src/cpu/x86/fp32/depthwise_pointwise_convolution_transform.cpp +++ b/compute/tensor/src/cpu/x86/fp32/depthwise_pointwise_convolution_transform.cpp @@ -31,7 +31,7 @@ EE depthwise_pointwise_convolution_transform_filter_fp32(TensorDesc dwFilterDesc } ConvolutionParamSpec p = createConvolutionParamSpec(1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, - 1, pwFilterDesc.dims[pwFilterDesc.nDims - 1], Convolution_Pointwise); + 1, pwFilterDesc.dims[pwFilterDesc.nDims - 1], CONVOLUTION_POINTWISE); ret = convolution_transform_filter_fp32( pwFilterDesc, pwFilter, p, CONVOLUTION_ALGORITHM_POINTWISE, pwFtmDesc, pwFilterTransformed); CHECK_STATUS(ret); diff --git a/compute/tensor/src/cpu/x86/fp32/eltwise.cpp b/compute/tensor/src/cpu/x86/fp32/eltwise.cpp index 4094c251..9f89a0fa 100644 --- a/compute/tensor/src/cpu/x86/fp32/eltwise.cpp +++ b/compute/tensor/src/cpu/x86/fp32/eltwise.cpp @@ -104,17 +104,24 @@ EE eltwise_fp32(std::vector input, EltwiseMode eltwiseMode) { EE ret = SUCCESS; - if ((num == 2) && (inputSize[0] == (I32)len) && (inputSize[0] == inputSize[1])) { + if ((num == 2) && (inputSize[0] != 1) && (inputSize[1] != 1)) { F32 *in0 = (F32 *)input[0]; F32 *in1 = (F32 *)input[1]; F32 *out = (F32 *)output; + len = UNI_MIN(inputSize[0], inputSize[1]); #ifdef _USE_OPENMP - U32 BLOCK = ((len + OMP_NUM_THREADS - 1) / OMP_NUM_THREADS + 7) / 8 * 8; + U32 ompBlock = ((len + OMP_NUM_THREADS - 1) / OMP_NUM_THREADS + 7) / 8 * 8; + U32 BLOCK = UNI_MAX(64, ompBlock); U32 blockNum = (len + BLOCK - 1) / BLOCK; -#pragma omp parallel num_threads(OMP_NUM_THREADS) - { + int in_parallel = omp_in_parallel(); + if (in_parallel != 0) { + BLOCK = len; + blockNum = 1; + } +#pragma omp parallel num_threads(OMP_NUM_THREADS) if (in_parallel == 0) #endif + { switch (eltwiseMode) { case ELTWISE_SUM: { #ifdef _USE_OPENMP @@ -125,7 +132,7 @@ EE eltwise_fp32(std::vector input, eltwise_kernel(vaddps, vaddss, blockSize, in0 + off, in1 + off, out + off); } #else - eltwise_kernel(vaddps, vaddss, len, in0, in1, out); + eltwise_kernel(vaddps, vaddss, len, in0, in1, out); #endif break; } @@ -138,7 +145,7 @@ EE eltwise_fp32(std::vector input, 
eltwise_kernel(vmaxps, vmaxss, blockSize, in0 + off, in1 + off, out + off); } #else - eltwise_kernel(vmaxps, vmaxss, len, in0, in1, out); + eltwise_kernel(vmaxps, vmaxss, len, in0, in1, out); #endif break; } @@ -152,7 +159,7 @@ EE eltwise_fp32(std::vector input, eltwise_kernel(vmulps, vmulss, blockSize, in0 + off, in1 + off, out + off); } #else - eltwise_kernel(vmulps, vmulss, len, in0, in1, out); + eltwise_kernel(vmulps, vmulss, len, in0, in1, out); #endif break; } @@ -165,7 +172,7 @@ EE eltwise_fp32(std::vector input, eltwise_kernel(vsubps, vsubss, blockSize, in0 + off, in1 + off, out + off); } #else - eltwise_kernel(vsubps, vsubss, len, in0, in1, out); + eltwise_kernel(vsubps, vsubss, len, in0, in1, out); #endif break; } @@ -178,80 +185,93 @@ EE eltwise_fp32(std::vector input, eltwise_kernel(vdivps, vdivss, blockSize, in0 + off, in1 + off, out + off); } #else - eltwise_kernel(vdivps, vdivss, len, in0, in1, out); + eltwise_kernel(vdivps, vdivss, len, in0, in1, out); #endif break; } default: ret = NOT_SUPPORTED; + break; } -#ifdef _USE_OPENMP } -#endif return ret; } - F32 buffer[8]; - F32 *tmp = buffer; U32 len_tail = len % 8; U32 len_main = len - len_tail; - F32 *output_ptr = (F32 *)output; - for (U32 i = 0; i < len_main; i += 8) { - get_vector((F32 *)input[0], inputSize[0], &tmp, 8, i, 8, buffer); - __m256 tmp_v = _mm256_loadu_ps(tmp); - for (U32 j = 1; j < num; j++) { - get_vector((F32 *)input[j], inputSize[j], &tmp, 8, i, 8, buffer); - __m256 value_v = _mm256_loadu_ps(tmp); - switch (eltwiseMode) { - case ELTWISE_SUM: - tmp_v = _mm256_add_ps(value_v, tmp_v); - break; - case ELTWISE_MAX: - tmp_v = _mm256_max_ps(value_v, tmp_v); - break; - case ELTWISE_PROD: - tmp_v = _mm256_mul_ps(value_v, tmp_v); - break; - case ELTWISE_SUB: - tmp_v = _mm256_sub_ps(tmp_v, value_v); - break; - case ELTWISE_DIV: - tmp_v = _mm256_div_ps(tmp_v, value_v); - break; - default: - ret = NOT_SUPPORTED; +#ifdef _USE_OPENMP + int in_parallel = omp_in_parallel(); +#pragma omp parallel num_threads(OMP_NUM_THREADS) if (in_parallel == 0) +#endif + { + F32 buffer[8]; + F32 *tmp = buffer; + F32 *output_ptr = (F32 *)output; +#ifdef _USE_OPENMP +#pragma omp for +#endif + for (U32 i = 0; i < len_main; i += 8) { + get_vector((F32 *)input[0], inputSize[0], &tmp, 8, i, 8, buffer); + __m256 tmp_v = _mm256_loadu_ps(tmp); + for (U32 j = 1; j < num; j++) { + get_vector((F32 *)input[j], inputSize[j], &tmp, 8, i, 8, buffer); + __m256 value_v = _mm256_loadu_ps(tmp); + switch (eltwiseMode) { + case ELTWISE_SUM: + tmp_v = _mm256_add_ps(value_v, tmp_v); + break; + case ELTWISE_MAX: + tmp_v = _mm256_max_ps(value_v, tmp_v); + break; + case ELTWISE_PROD: + tmp_v = _mm256_mul_ps(value_v, tmp_v); + break; + case ELTWISE_SUB: + tmp_v = _mm256_sub_ps(tmp_v, value_v); + break; + case ELTWISE_DIV: + tmp_v = _mm256_div_ps(tmp_v, value_v); + break; + default: + ret = NOT_SUPPORTED; + break; + } } + _mm256_storeu_ps(output_ptr + i, tmp_v); } - _mm256_storeu_ps(output_ptr + i, tmp_v); - } - for (U32 i = len_main; i < len; i++) { - get_vector((F32 *)input[0], inputSize[0], &tmp, 8, i, 1, buffer); - F32 tmp_s = tmp[0]; - for (U32 j = 1; j < num; j++) { - get_vector((F32 *)input[j], inputSize[j], &tmp, 8, i, 1, buffer); - F32 value_s = tmp[0]; - switch (eltwiseMode) { - case ELTWISE_SUM: - tmp_s = value_s + tmp_s; - break; - case ELTWISE_MAX: - tmp_s = (value_s > tmp_s) ? 
value_s : tmp_s; - break; - case ELTWISE_PROD: - tmp_s *= value_s; - break; - case ELTWISE_SUB: - tmp_s = tmp_s - value_s; - break; - case ELTWISE_DIV: - tmp_s = tmp_s / value_s; - break; - default: - ret = NOT_SUPPORTED; +#ifdef _USE_OPENMP +#pragma omp for +#endif + for (U32 i = len_main; i < len; i++) { + get_vector((F32 *)input[0], inputSize[0], &tmp, 8, i, 1, buffer); + F32 tmp_s = tmp[0]; + for (U32 j = 1; j < num; j++) { + get_vector((F32 *)input[j], inputSize[j], &tmp, 8, i, 1, buffer); + F32 value_s = tmp[0]; + switch (eltwiseMode) { + case ELTWISE_SUM: + tmp_s = value_s + tmp_s; + break; + case ELTWISE_MAX: + tmp_s = (value_s > tmp_s) ? value_s : tmp_s; + break; + case ELTWISE_PROD: + tmp_s *= value_s; + break; + case ELTWISE_SUB: + tmp_s = tmp_s - value_s; + break; + case ELTWISE_DIV: + tmp_s = tmp_s / value_s; + break; + default: + ret = NOT_SUPPORTED; + break; + } } + output_ptr[i] = tmp_s; } - output_ptr[i] = tmp_s; } return ret; } @@ -264,17 +284,23 @@ EE eltwise_i32(std::vector input, EltwiseMode eltwiseMode) { EE ret = SUCCESS; - if ((num == 2) && (inputSize[0] == (I32)len) && (inputSize[0] == inputSize[1])) { + if ((num == 2) && (inputSize[0] != 1) && (inputSize[1] != 1)) { I32 *in0 = (I32 *)input[0]; I32 *in1 = (I32 *)input[1]; I32 *out = (I32 *)output; #ifdef _USE_OPENMP - U32 BLOCK = ((len + OMP_NUM_THREADS - 1) / OMP_NUM_THREADS + 7) / 8 * 8; + U32 ompBlock = ((len + OMP_NUM_THREADS - 1) / OMP_NUM_THREADS + 7) / 8 * 8; + U32 BLOCK = UNI_MAX(64, ompBlock); U32 blockNum = (len + BLOCK - 1) / BLOCK; -#pragma omp parallel num_threads(OMP_NUM_THREADS) - { + int in_parallel = omp_in_parallel(); + if (in_parallel != 0) { + BLOCK = len; + blockNum = 1; + } +#pragma omp parallel num_threads(OMP_NUM_THREADS) if (in_parallel == 0) #endif + { switch (eltwiseMode) { case ELTWISE_SUM: { #ifdef _USE_OPENMP @@ -285,7 +311,7 @@ EE eltwise_i32(std::vector input, eltwise_kernel(vpaddd, vpaddd, blockSize, in0 + off, in1 + off, out + off); } #else - eltwise_kernel(vpaddd, vpaddd, len, in0, in1, out); + eltwise_kernel(vpaddd, vpaddd, len, in0, in1, out); #endif break; } @@ -298,7 +324,7 @@ EE eltwise_i32(std::vector input, eltwise_kernel(vpmaxsd, vpmaxsd, blockSize, in0 + off, in1 + off, out + off); } #else - eltwise_kernel(vpmaxsd, vpmaxsd, len, in0, in1, out); + eltwise_kernel(vpmaxsd, vpmaxsd, len, in0, in1, out); #endif break; } @@ -311,7 +337,7 @@ EE eltwise_i32(std::vector input, eltwise_kernel(vpmulld, vpmulld, blockSize, in0 + off, in1 + off, out + off); } #else - eltwise_kernel(vpmulld, vpmulld, len, in0, in1, out); + eltwise_kernel(vpmulld, vpmulld, len, in0, in1, out); #endif break; } @@ -324,20 +350,90 @@ EE eltwise_i32(std::vector input, eltwise_kernel(vpsubd, vpsubd, blockSize, in0 + off, in1 + off, out + off); } #else - eltwise_kernel(vpsubd, vpsubd, len, in0, in1, out); + eltwise_kernel(vpsubd, vpsubd, len, in0, in1, out); #endif break; } default: ret = NOT_SUPPORTED; + break; } -#ifdef _USE_OPENMP } -#endif return ret; } - return NOT_SUPPORTED; + U32 len_tail = len % 8; + U32 len_main = len - len_tail; +#ifdef _USE_OPENMP + int in_parallel = omp_in_parallel(); +#pragma omp parallel num_threads(OMP_NUM_THREADS) if (in_parallel == 0) +#endif + { + I32 buffer[8]; + I32 *tmp = buffer; + I32 *output_ptr = (I32 *)output; +#ifdef _USE_OPENMP +#pragma omp for +#endif + for (U32 i = 0; i < len_main; i += 8) { + get_vector((I32 *)input[0], inputSize[0], &tmp, 8, i, 8, buffer); + __m256i tmp_v = _mm256_loadu_si256((const __m256i *)tmp); + for (U32 j = 1; j < num; j++) { + 
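// get_vector() supplies 8 lanes of operand j starting at element i, using the
// on-stack buffer when input[j] cannot provide 8 contiguous elements directly
// (for example a broadcast scalar, i.e. inputSize[j] == 1).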
get_vector((I32 *)input[j], inputSize[j], &tmp, 8, i, 8, buffer); + __m256i value_v = _mm256_loadu_si256((const __m256i *)tmp); + switch (eltwiseMode) { + case ELTWISE_SUM: + tmp_v = _mm256_add_epi32(value_v, tmp_v); + break; + case ELTWISE_MAX: + tmp_v = _mm256_max_epi32(value_v, tmp_v); + break; + case ELTWISE_PROD: + tmp_v = _mm256_mullo_epi32(value_v, tmp_v); + break; + case ELTWISE_SUB: + tmp_v = _mm256_sub_epi32(tmp_v, value_v); + break; + default: + ret = NOT_SUPPORTED; + } + } + _mm256_storeu_si256((__m256i *)(output_ptr + i), tmp_v); + } + +#ifdef _USE_OPENMP +#pragma omp for +#endif + for (U32 i = len_main; i < len; i++) { + get_vector((I32 *)input[0], inputSize[0], &tmp, 8, i, 1, buffer); + I32 tmp_s = tmp[0]; + for (U32 j = 1; j < num; j++) { + get_vector((I32 *)input[j], inputSize[j], &tmp, 8, i, 1, buffer); + I32 value_s = tmp[0]; + switch (eltwiseMode) { + case ELTWISE_SUM: + tmp_s = value_s + tmp_s; + break; + case ELTWISE_MAX: + tmp_s = (value_s > tmp_s) ? value_s : tmp_s; + break; + case ELTWISE_PROD: + tmp_s *= value_s; + break; + case ELTWISE_SUB: + tmp_s = tmp_s - value_s; + break; + case ELTWISE_DIV: + tmp_s = tmp_s / value_s; + break; + default: + ret = NOT_SUPPORTED; + } + } + output_ptr[i] = tmp_s; + } + } + return ret; } EE eltwise_u8(std::vector input, @@ -348,55 +444,69 @@ EE eltwise_u8(std::vector input, EltwiseMode eltwiseMode) { EE ret = SUCCESS; - U8 buffer[32]; - U8 *tmp = buffer; U32 len_tail = len % 32; U32 len_main = len - len_tail; - U8 *output_ptr = (U8 *)output; - for (U32 i = 0; i < len_main; i += 32) { - get_vector((U8 *)input[0], inputSize[0], &tmp, 32, i, 32, buffer); - __m256i tmp_v = _mm256_loadu_si256((__m256i const *)tmp); - for (U32 j = 1; j < num; j++) { - get_vector((U8 *)input[j], inputSize[j], &tmp, 32, i, 32, buffer); - __m256i value_v = _mm256_loadu_si256((__m256i const *)tmp); - switch (eltwiseMode) { - case ELTWISE_AND: - tmp_v = _mm256_and_si256(value_v, tmp_v); - break; - case ELTWISE_OR: - tmp_v = _mm256_or_si256(value_v, tmp_v); - break; - case ELTWISE_XOR: - tmp_v = _mm256_xor_si256(value_v, tmp_v); - break; - default: - ret = NOT_SUPPORTED; +#ifdef _USE_OPENMP + int in_parallel = omp_in_parallel(); +#pragma omp parallel num_threads(OMP_NUM_THREADS) if (in_parallel == 0) +#endif + { + U8 buffer[32]; + U8 *tmp = buffer; + U8 *output_ptr = (U8 *)output; +#ifdef _USE_OPENMP +#pragma omp for +#endif + for (U32 i = 0; i < len_main; i += 32) { + get_vector((U8 *)input[0], inputSize[0], &tmp, 32, i, 32, buffer); + __m256i tmp_v = _mm256_loadu_si256((__m256i const *)tmp); + for (U32 j = 1; j < num; j++) { + get_vector((U8 *)input[j], inputSize[j], &tmp, 32, i, 32, buffer); + __m256i value_v = _mm256_loadu_si256((__m256i const *)tmp); + switch (eltwiseMode) { + case ELTWISE_AND: + tmp_v = _mm256_and_si256(value_v, tmp_v); + break; + case ELTWISE_OR: + tmp_v = _mm256_or_si256(value_v, tmp_v); + break; + case ELTWISE_XOR: + tmp_v = _mm256_xor_si256(value_v, tmp_v); + break; + default: + ret = NOT_SUPPORTED; + break; + } } + _mm256_storeu_si256((__m256i *)(output_ptr + i), tmp_v); } - _mm256_storeu_si256((__m256i *)(output_ptr + i), tmp_v); - } - for (U32 i = len_main; i < len; i++) { - get_vector((U8 *)input[0], inputSize[0], &tmp, 32, i, 1, buffer); - U8 tmp_s = tmp[0]; - for (U32 j = 1; j < num; j++) { - get_vector((U8 *)input[j], inputSize[j], &tmp, 32, i, 1, buffer); - U8 value_s = tmp[0]; - switch (eltwiseMode) { - case ELTWISE_AND: - tmp_s = value_s & tmp_s; - break; - case ELTWISE_OR: - tmp_s = value_s | tmp_s; - break; - case 
ELTWISE_XOR: - tmp_s = value_s ^ tmp_s; - break; - default: - ret = NOT_SUPPORTED; +#ifdef _USE_OPENMP +#pragma omp for +#endif + for (U32 i = len_main; i < len; i++) { + get_vector((U8 *)input[0], inputSize[0], &tmp, 32, i, 1, buffer); + U8 tmp_s = tmp[0]; + for (U32 j = 1; j < num; j++) { + get_vector((U8 *)input[j], inputSize[j], &tmp, 32, i, 1, buffer); + U8 value_s = tmp[0]; + switch (eltwiseMode) { + case ELTWISE_AND: + tmp_s = value_s & tmp_s; + break; + case ELTWISE_OR: + tmp_s = value_s | tmp_s; + break; + case ELTWISE_XOR: + tmp_s = value_s ^ tmp_s; + break; + default: + ret = NOT_SUPPORTED; + break; + } } + output_ptr[i] = tmp_s; } - output_ptr[i] = tmp_s; } return ret; } diff --git a/compute/tensor/src/cpu/x86/fp32/gru.cpp b/compute/tensor/src/cpu/x86/fp32/gru.cpp index 93a7b62c..877b94fa 100644 --- a/compute/tensor/src/cpu/x86/fp32/gru.cpp +++ b/compute/tensor/src/cpu/x86/fp32/gru.cpp @@ -11,7 +11,6 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -#include #include "cpu/x86/fp32/tensor_computing_fp32.h" #include "cpu/x86/fp32/mvm_nkn32.h" @@ -54,10 +53,10 @@ EE grucell_fp32(TensorDesc xDesc, U32 batch = in; I32 xDim = ix; - I32 hDim = rnnParamSpec.numOutput; - I32 column = (rnnParamSpec.numProjection > 0) ? rnnParamSpec.numProjection - : rnnParamSpec.numOutput; - int num1 = rnnParamSpec.biDirection ? 2 : 1; + I32 hDim = rnnParamSpec.num_outputs; + I32 column = (rnnParamSpec.num_projection > 0) ? rnnParamSpec.num_projection + : rnnParamSpec.num_outputs; + int num1 = rnnParamSpec.bi_direction ? 2 : 1; U32 steps = batchStrideH / hDim / num1; if (!(idt == DT_F32 && fdt == DT_F32 && odt == DT_F32)) { CHECK_STATUS(NOT_MATCH); @@ -65,8 +64,7 @@ EE grucell_fp32(TensorDesc xDesc, if (!(3 * column == (I32)fn * 32 && (ix + oh) == fk && in == on)) { CHECK_STATUS(NOT_MATCH); } - ActivationMode activationMode = rnnParamSpec.activationMode; - if (activationMode != ACTIVATION_TANH) { + if (rnnParamSpec.activation_type != ACTIVATION_TANH) { CHECK_STATUS(NOT_SUPPORTED); } @@ -85,12 +83,12 @@ EE grucell_fp32(TensorDesc xDesc, F32 *currentBatchH = currentHArray + m * currentHStride; F32 *currentOutput = outputArray + m * batchStrideH; if (xDim > 0) { - memcpy(xhArray, currentXArray + m * batchStrideX, xDim * sizeof(F32)); - memcpy(xhArray + xDim, lastBatchH, hDim * sizeof(F32)); + UNI_MEMCPY(xhArray, currentXArray + m * batchStrideX, xDim * sizeof(F32)); + UNI_MEMCPY(xhArray + xDim, lastBatchH, hDim * sizeof(F32)); } else { intermediateH = tmpArray; xhArray = lastBatchH; - memcpy(currentOutput, lastBatchH, hDim * sizeof(F32)); + UNI_MEMCPY(currentOutput, lastBatchH, hDim * sizeof(F32)); } const F32 *mBias = (const F32 *)bias[0] + m * steps * column * 3; @@ -147,7 +145,7 @@ EE grucell_fp32(TensorDesc xDesc, array_scale_f32(out_z, out_z, column, -1, 1); array_mul_f32(out_z, out_h, out_h, column); array_add_f32(out_r, out_h, currentOutput, column); - memcpy(currentBatchH, currentOutput, sizeof(F32) * hDim); + UNI_MEMCPY(currentBatchH, currentOutput, sizeof(F32) * hDim); } return SUCCESS; } diff --git a/compute/tensor/src/cpu/x86/fp32/instance_norm.cpp b/compute/tensor/src/cpu/x86/fp32/instance_norm.cpp index b80f19da..b0f62cbe 100644 --- a/compute/tensor/src/cpu/x86/fp32/instance_norm.cpp +++ b/compute/tensor/src/cpu/x86/fp32/instance_norm.cpp @@ -60,11 +60,15 @@ EE instance_norm_fp32(TensorDesc inputDesc, } for 
(I32 i = 0; i < loopOuter; i += 8) { + __m256 m1 = _mm256_setzero_ps(); __m256 m = _mm256_setzero_ps(); for (I32 j = 0; j < loopInner; ++j) { - m = _mm256_add_ps(m, _mm256_loadu_ps(input + i * loopInner + j * 8)); + m1 = _mm256_add_ps(m1, _mm256_loadu_ps(input + i * loopInner + j * 8)); + if (((j + 1) % 1024 == 0) || (j == loopInner - 1)) { + m = _mm256_add_ps(m, _mm256_div_ps(m1, loopInner_v)); + m1 = _mm256_setzero_ps(); + } } - m = _mm256_div_ps(m, loopInner_v); __m256 v = _mm256_setzero_ps(); for (I32 j = 0; j < loopInner; ++j) { __m256 t = _mm256_sub_ps(_mm256_loadu_ps(input + i * loopInner + j * 8), m); diff --git a/compute/tensor/src/cpu/x86/fp32/lstm.cpp b/compute/tensor/src/cpu/x86/fp32/lstm.cpp index 3094c40b..526051e5 100644 --- a/compute/tensor/src/cpu/x86/fp32/lstm.cpp +++ b/compute/tensor/src/cpu/x86/fp32/lstm.cpp @@ -11,7 +11,6 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -#include #include "cpu/x86/fp32/tensor_computing_fp32.h" #include "cpu/x86/fp32/mvm_nkn32.h" @@ -54,10 +53,10 @@ EE lstmcell_fp32(TensorDesc xDesc, U32 batch = in; I32 xDim = ix; - I32 hDim = rnnParamSpec.numOutput; - I32 column = (rnnParamSpec.numProjection > 0) ? rnnParamSpec.numProjection - : rnnParamSpec.numOutput; - int num1 = rnnParamSpec.biDirection ? 2 : 1; + I32 hDim = rnnParamSpec.num_outputs; + I32 column = (rnnParamSpec.num_projection > 0) ? rnnParamSpec.num_projection + : rnnParamSpec.num_outputs; + int num1 = rnnParamSpec.bi_direction ? 2 : 1; U32 steps = batchStrideH / hDim / num1; if (!(idt == DT_F32 && fdt == DT_F32 && odt == DT_F32)) { CHECK_STATUS(NOT_MATCH); @@ -65,9 +64,8 @@ EE lstmcell_fp32(TensorDesc xDesc, if (!(4 * column == (I32)fn * 32 && (ix + oh) == fk && in == on)) { CHECK_STATUS(NOT_MATCH); } - F32 forgetBias = rnnParamSpec.forgetBias; - ActivationMode activationMode = rnnParamSpec.activationMode; - if (activationMode != ACTIVATION_TANH) { + F32 forgetBias = rnnParamSpec.forget_bias; + if (rnnParamSpec.activation_type != ACTIVATION_TANH) { CHECK_STATUS(NOT_SUPPORTED); } @@ -88,8 +86,8 @@ EE lstmcell_fp32(TensorDesc xDesc, for (U32 m = 0; m < batch; m++) { F32 *lastBatchH = lastHArray + m * lastHStride; if (xDim > 0) { - memcpy(xhArray, currentXArray + m * batchStrideX, xDim * sizeof(F32)); - memcpy(xhArray + xDim, lastBatchH, hDim * sizeof(F32)); + UNI_MEMCPY(xhArray, currentXArray + m * batchStrideX, xDim * sizeof(F32)); + UNI_MEMCPY(xhArray + xDim, lastBatchH, hDim * sizeof(F32)); } else { intermediateH = tmpArray; xhArray = lastBatchH; @@ -108,12 +106,12 @@ EE lstmcell_fp32(TensorDesc xDesc, F32 *currentOutput = outputArray + m * batchStrideH; F32 *tmpState, *tmpHH, *tmpH; - if (rnnParamSpec.zoneoutCell == 0) { + if (rnnParamSpec.zoneout_cell == 0) { tmpState = currentBatchState; } else { tmpState = out_i; } - if (rnnParamSpec.numProjection > 0) { + if (rnnParamSpec.num_projection > 0) { tmpHH = out_g; tmpH = currentOutput; } else { @@ -148,27 +146,27 @@ EE lstmcell_fp32(TensorDesc xDesc, tmpState[h] = C_s; tmpHH[h] = value; } - if (rnnParamSpec.zoneoutCell != 0) { - array_scale_f32(tmpState, tmpState, column, 1 - rnnParamSpec.zoneoutCell, 0); - array_scale_f32(lastBatchState, lastBatchState, column, rnnParamSpec.zoneoutCell, 0); + if (rnnParamSpec.zoneout_cell != 0) { + array_scale_f32(tmpState, tmpState, column, 1 - rnnParamSpec.zoneout_cell, 0); + array_scale_f32(lastBatchState, 
lastBatchState, column, rnnParamSpec.zoneout_cell, 0); array_add_f32(tmpState, lastBatchState, currentBatchState, column); } - if (rnnParamSpec.numProjection > 0) { - mvm_nkn32_with_bias(hDim / 32, rnnParamSpec.numProjection, (const F32 *)filter[1], + if (rnnParamSpec.num_projection > 0) { + mvm_nkn32_with_bias(hDim / 32, rnnParamSpec.num_projection, (const F32 *)filter[1], tmpHH, tmpH, nullptr); } - if (rnnParamSpec.zoneoutOutput != 0) { - if (rnnParamSpec.numProjection > 0) { - array_scale_f32(tmpH, out_f, hDim, 1 - rnnParamSpec.zoneoutOutput, 0); + if (rnnParamSpec.zoneout_output != 0) { + if (rnnParamSpec.num_projection > 0) { + array_scale_f32(tmpH, out_f, hDim, 1 - rnnParamSpec.zoneout_output, 0); } else { - array_scale_f32(tmpHH, out_f, hDim, 1 - rnnParamSpec.zoneoutOutput, 0); + array_scale_f32(tmpHH, out_f, hDim, 1 - rnnParamSpec.zoneout_output, 0); } - array_scale_f32(lastBatchH, lastBatchH, hDim, rnnParamSpec.zoneoutOutput, 0); + array_scale_f32(lastBatchH, lastBatchH, hDim, rnnParamSpec.zoneout_output, 0); array_add_f32(out_f, lastBatchH, currentBatchH, hDim); } else { - memcpy(currentBatchH, currentOutput, sizeof(F32) * hDim); + UNI_MEMCPY(currentBatchH, currentOutput, sizeof(F32) * hDim); } } return SUCCESS; diff --git a/compute/tensor/src/cpu/x86/fp32/mvm_nkn32.h b/compute/tensor/src/cpu/x86/fp32/mvm_nkn32.h index cc5db76f..88f31c14 100644 --- a/compute/tensor/src/cpu/x86/fp32/mvm_nkn32.h +++ b/compute/tensor/src/cpu/x86/fp32/mvm_nkn32.h @@ -23,6 +23,7 @@ inline void mvm_nkn32_with_bias( #pragma omp parallel for num_threads(OMP_NUM_THREADS) #endif for (U32 n = 0; n < fn; ++n) { + FTZ; const F32 *f = filterArray + n * fk * 32; F32 *out = output + n * 32; const F32 *b = bias + n * 32; diff --git a/compute/tensor/src/cpu/x86/fp32/normalization.cpp b/compute/tensor/src/cpu/x86/fp32/normalization.cpp index 46b08655..7d6276bc 100644 --- a/compute/tensor/src/cpu/x86/fp32/normalization.cpp +++ b/compute/tensor/src/cpu/x86/fp32/normalization.cpp @@ -14,10 +14,11 @@ #include #include "cpu/x86/fp32/tensor_computing_fp32.h" -inline void array_norm_scale_fp32( +static F32 eps = 1e-6; + +inline static void array_norm_scale_fp32( F32 *input, F32 *output, I32 len, F32 mean, F32 var, F32 *alpha, F32 *beta) { - F32 eps = 1e-6; F32 std_value = sqrt(var + eps); __m256 mean_v = _mm256_set1_ps(mean); __m256 std_v = _mm256_set1_ps(std_value); @@ -38,17 +39,17 @@ inline void array_norm_scale_fp32( } } -EE layer_normalization_fp32( +static EE layer_normalization_nhwc( TensorDesc inputDesc, F32 *input, F32 *alpha, F32 *beta, TensorDesc outputDesc, F32 *output) { UNUSED(outputDesc); - if (nullptr == alpha || nullptr == beta || nullptr == input || nullptr == output) { - CHECK_STATUS(NULL_POINTER); - } - U32 size = tensorNumElements(inputDesc); I32 size_inner = inputDesc.dims[0]; I32 size_outer = size / size_inner; + +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) schedule(static) +#endif for (I32 i = 0; i < size_outer; i++) { F32 *current_input = input + i * size_inner; F32 *current_output = output + i * size_inner; @@ -57,6 +58,81 @@ EE layer_normalization_fp32( array_norm_scale_fp32(current_input, current_output, size_inner, mean, var, alpha, beta); } + return SUCCESS; +} + +static EE layer_normalization_nchwc8( + TensorDesc inputDesc, F32 *input, F32 *alpha, F32 *beta, TensorDesc outputDesc, F32 *output) +{ + UNUSED(outputDesc); + int n = inputDesc.dims[inputDesc.nDims - 1]; + int c = inputDesc.dims[inputDesc.nDims - 2]; + int hw = 1; + for (unsigned int i = 0; i < 
inputDesc.nDims - 2; i++) { + hw *= inputDesc.dims[i]; + } + int c8 = c / 8; + int nums = n * hw; +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) schedule(static) +#endif + for (int x = 0; x < nums; ++x) { + int i = x / hw; + int j = x % hw; + __m256 sum_v = _mm256_set1_ps(0); + for (int k = 0; k < c8; k++) { + int id = ((i * c8 + k) * hw + j) * 8; + sum_v = _mm256_add_ps(sum_v, _mm256_loadu_ps(input + id)); + } + F32 mean = _mm256_sum_ps(sum_v) / c; + __m256 mean_v = _mm256_set1_ps(mean); + + sum_v = _mm256_set1_ps(0); + for (int k = 0; k < c8; k++) { + int id = ((i * c8 + k) * hw + j) * 8; + __m256 tmp_v = _mm256_sub_ps(_mm256_loadu_ps(input + id), mean_v); + sum_v = _mm256_fmadd_ps(tmp_v, tmp_v, sum_v); + } + F32 var = _mm256_sum_ps(sum_v) / c; + F32 std_value = sqrt(var + eps); + + __m256 std_v = _mm256_set1_ps(std_value); + for (int k = 0, kk = 0; k < c8; k++, kk += 8) { + int id = ((i * c8 + k) * hw + j) * 8; + __m256 in = _mm256_loadu_ps(input + id); + __m256 alpha_v = _mm256_loadu_ps(alpha + kk); + __m256 beta_v = _mm256_loadu_ps(beta + kk); + __m256 tmp_v = _mm256_sub_ps(in, mean_v); + tmp_v = _mm256_div_ps(tmp_v, std_v); + tmp_v = _mm256_fmadd_ps(alpha_v, tmp_v, beta_v); + _mm256_storeu_ps(output + id, tmp_v); + } + } return SUCCESS; } + +EE layer_normalization_fp32(TensorDesc inputDesc, + F32 *input, + LayerNormParamSpec p, + F32 *alpha, + F32 *beta, + TensorDesc outputDesc, + F32 *output) +{ + if (nullptr == alpha || nullptr == beta || nullptr == input || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + + EE ret = NOT_SUPPORTED; + if (inputDesc.df == DF_NCHWC8) { + if (p.axis == 1) { + ret = layer_normalization_nchwc8(inputDesc, input, alpha, beta, outputDesc, output); + } + } else { + if (p.axis == -1) { + ret = layer_normalization_nhwc(inputDesc, input, alpha, beta, outputDesc, output); + } + } + return ret; +} diff --git a/compute/tensor/src/cpu/x86/fp32/pooling.cpp b/compute/tensor/src/cpu/x86/fp32/pooling.cpp index ec456ab7..7c92885d 100644 --- a/compute/tensor/src/cpu/x86/fp32/pooling.cpp +++ b/compute/tensor/src/cpu/x86/fp32/pooling.cpp @@ -12,6 +12,7 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
#include "cpu/x86/fp32/tensor_computing_fp32.h" +#include "cpu/x86/fp32/pooling_kernel.h" #define UNROLL_W 4 @@ -19,255 +20,8 @@ typedef void (*pooling_max_func)(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 typedef void (*pooling_mean_func)( const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride, F32 poolSize); -void pooling_max_w4(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride) -{ - __asm__ __volatile__("mov %%eax, %%eax \n\t" - "mov %4, %%eax \n\t" - "mov %%rax, %%rdi \n\t" - "mov %%eax, %%eax \n\t" - "mov %5, %%eax \n\t" - "mov %%rax, %%r9 \n\t" - "add %%r9, %%r9 \n\t" - "mov %%rax, %%r10 \n\t" - "add %%r9, %%r10 \n\t" - "add %0, %%rax \n\t" - "add %0, %%r9 \n\t" - "add %0, %%r10 \n\t" - - "vmovups (%0), %%ymm0 \n\t" - "vmovups (%%rax), %%ymm1 \n\t" - "vmovups (%%r9), %%ymm2 \n\t" - "vmovups (%%r10), %%ymm3 \n\t" - - ".align 16 \n\t" - "0: \n\t" - - "mov %2, %%ecx \n\t" - ".align 16 \n\t" - "1: \n\t" - - "vmovups (%0), %%ymm4 \n\t" - "vmovups (%%rax), %%ymm5 \n\t" - "vmovups (%%r9), %%ymm6 \n\t" - "vmovups (%%r10), %%ymm7 \n\t" - - "vmaxps %%ymm0, %%ymm4, %%ymm0 \n\t" - "vmaxps %%ymm1, %%ymm5, %%ymm1 \n\t" - "vmaxps %%ymm2, %%ymm6, %%ymm2 \n\t" - "vmaxps %%ymm3, %%ymm7, %%ymm3 \n\t" - - "add $0x20, %0 \n\t" - "add $0x20, %%rax \n\t" - "add $0x20, %%r9 \n\t" - "add $0x20, %%r10 \n\t" - "dec %%ecx \n\t" - "jg 1b \n\t" - - "add %%rdi, %0 \n\t" - "add %%rdi, %%rax \n\t" - "add %%rdi, %%r9 \n\t" - "add %%rdi, %%r10 \n\t" - "dec %%ebx \n\t" - "jg 0b \n\t" - - "vmovups %%ymm0, (%1) \n\t" - "vmovups %%ymm1, 0x20(%1) \n\t" - "vmovups %%ymm2, 0x40(%1) \n\t" - "vmovups %%ymm3, 0x60(%1) \n\t" - : - : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride) - : "%eax", "%rax", "%ecx", "%r10", "%r9", "%rdi", "%ymm0", "%ymm1", "%ymm2", - "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "memory", "cc"); -} - -void pooling_max_w2(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride) -{ - __asm__ __volatile__( - "mov %%eax, %%eax \n\t" - "mov %4, %%eax \n\t" - "mov %%rax, %%rdi \n\t" - "mov %%eax, %%eax \n\t" - "mov %5, %%eax \n\t" - "add %0, %%rax \n\t" - "vmovups (%0), %%ymm0 \n\t" - "vmovups (%%rax), %%ymm1 \n\t" - ".align 16 \n\t" - "0: \n\t" - "mov %2, %%ecx \n\t" - ".align 16 \n\t" - "1: \n\t" - "vmovups (%0), %%ymm4 \n\t" - "vmovups (%%rax), %%ymm5 \n\t" - "vmaxps %%ymm0, %%ymm4, %%ymm0 \n\t" - "vmaxps %%ymm1, %%ymm5, %%ymm1 \n\t" - "add $0x20, %0 \n\t" - "add $0x20, %%rax \n\t" - "dec %%ecx \n\t" - "jg 1b \n\t" - "add %%rdi, %0 \n\t" - "add %%rdi, %%rax \n\t" - "dec %%ebx \n\t" - "jg 0b \n\t" - "vmovups %%ymm0, (%1) \n\t" - "vmovups %%ymm1, 0x20(%1) \n\t" - : - : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride) - : "%eax", "%rax", "%ecx", "%rdi", "%ymm0", "%ymm1", "%ymm4", "%ymm5", "memory", "cc"); -} - -void pooling_max_w1(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride) -{ - __asm__ __volatile__("mov %%eax, %%eax \n\t" - "mov %4, %%eax \n\t" - "mov %%rax, %%rdi \n\t" - "vmovups (%0), %%ymm0 \n\t" - ".align 16 \n\t" - "0: \n\t" - "mov %2, %%ecx \n\t" - ".align 16 \n\t" - "1: \n\t" - "vmovups (%0), %%ymm4 \n\t" - "vmaxps %%ymm0, %%ymm4, %%ymm0 \n\t" - "add $0x20, %0 \n\t" - "dec %%ecx \n\t" - "jg 1b \n\t" - "add %%rdi, %0 \n\t" - "dec %%ebx \n\t" - "jg 0b \n\t" - "vmovups %%ymm0, (%1) \n\t" - : - : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride) - : "%eax", "%rax", "%ecx", "%rdi", "%ymm0", "%ymm4", "memory", "cc"); -} - -void pooling_mean_w4(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 
stride, F32 poolSize) -{ - __asm__ __volatile__( - "mov %%eax, %%eax \n\t" - "mov %4, %%eax \n\t" - "mov %%rax, %%rdi \n\t" - "mov %5, %%eax \n\t" - "mov %%rax, %%r9 \n\t" - "add %%r9, %%r9 \n\t" - "mov %%rax, %%r10 \n\t" - "add %%r9, %%r10 \n\t" - "add %0, %%rax \n\t" - "add %0, %%r9 \n\t" - "add %0, %%r10 \n\t" - "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" - "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" - "vxorps %%ymm2, %%ymm2, %%ymm2 \n\t" - "vxorps %%ymm3, %%ymm3, %%ymm3 \n\t" - ".align 16 \n\t" - "0: \n\t" - "mov %2, %%ecx \n\t" - ".align 16 \n\t" - "1: \n\t" - "vmovups (%0), %%ymm4 \n\t" - "vmovups (%%rax), %%ymm5 \n\t" - "vmovups (%%r9), %%ymm6 \n\t" - "vmovups (%%r10), %%ymm7 \n\t" - "vaddps %%ymm0, %%ymm4, %%ymm0 \n\t" - "vaddps %%ymm1, %%ymm5, %%ymm1 \n\t" - "vaddps %%ymm2, %%ymm6, %%ymm2 \n\t" - "vaddps %%ymm3, %%ymm7, %%ymm3 \n\t" - "add $0x20, %0 \n\t" - "add $0x20, %%rax \n\t" - "add $0x20, %%r9 \n\t" - "add $0x20, %%r10 \n\t" - "dec %%ecx \n\t" - "jg 1b \n\t" - "add %%rdi, %0 \n\t" - "add %%rdi, %%rax \n\t" - "add %%rdi, %%r9 \n\t" - "add %%rdi, %%r10 \n\t" - "dec %%ebx \n\t" - "jg 0b \n\t" - "vbroadcastss (%6), %%ymm4 \n\t" - "vdivps %%ymm4, %%ymm0, %%ymm0 \n\t" - "vdivps %%ymm4, %%ymm1, %%ymm1 \n\t" - "vdivps %%ymm4, %%ymm2, %%ymm2 \n\t" - "vdivps %%ymm4, %%ymm3, %%ymm3 \n\t" - "vmovups %%ymm0, (%1) \n\t" - "vmovups %%ymm1, 0x20(%1) \n\t" - "vmovups %%ymm2, 0x40(%1) \n\t" - "vmovups %%ymm3, 0x60(%1) \n\t" - : - : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride), "r"(&poolSize) - : "%eax", "%rax", "%ecx", "%r10", "%r9", "%rdi", "%ymm0", "%ymm1", "%ymm2", "%ymm3", - "%ymm4", "%ymm5", "%ymm6", "%ymm7", "memory", "cc"); -} - -void pooling_mean_w2(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride, F32 poolSize) -{ - __asm__ __volatile__( - "mov %%eax, %%eax \n\t" - "mov %4, %%eax \n\t" - "mov %%rax, %%rdi \n\t" - "mov %5, %%eax \n\t" - "add %0, %%rax \n\t" - "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" - "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" - ".align 16 \n\t" - "0: \n\t" - "mov %2, %%ecx \n\t" - ".align 16 \n\t" - "1: \n\t" - "vmovups (%0), %%ymm4 \n\t" - "vmovups (%%rax), %%ymm5 \n\t" - "vaddps %%ymm0, %%ymm4, %%ymm0 \n\t" - "vaddps %%ymm1, %%ymm5, %%ymm1 \n\t" - "add $0x20, %0 \n\t" - "add $0x20, %%rax \n\t" - "dec %%ecx \n\t" - "jg 1b \n\t" - "add %%rdi, %0 \n\t" - "add %%rdi, %%rax \n\t" - "dec %%ebx \n\t" - "jg 0b \n\t" - "vbroadcastss (%6), %%ymm4 \n\t" - "vdivps %%ymm4, %%ymm0, %%ymm0 \n\t" - "vdivps %%ymm4, %%ymm1, %%ymm1 \n\t" - "vmovups %%ymm0, (%1) \n\t" - "vmovups %%ymm1, 0x20(%1) \n\t" - : - : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride), "r"(&poolSize) - : "%eax", "%rax", "%ecx", "%rdi", "%ymm0", "%ymm1", "%ymm4", "%ymm5", "memory", "cc"); -} - -void pooling_mean_w1(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride, F32 poolSize) -{ - __asm__ __volatile__( - "mov %%eax, %%eax \n\t" - "mov %4, %%eax \n\t" - "mov %%rax, %%rdi \n\t" - "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" - ".align 16 \n\t" - "0: \n\t" - "mov %2, %%ecx \n\t" - ".align 16 \n\t" - "1: \n\t" - "vmovups (%0), %%ymm4 \n\t" - "vaddps %%ymm0, %%ymm4, %%ymm0 \n\t" - "add $0x20, %0 \n\t" - "dec %%ecx \n\t" - "jg 1b \n\t" - "add %%rdi, %0 \n\t" - "dec %%ebx \n\t" - "jg 0b \n\t" - "vbroadcastss (%6), %%ymm4 \n\t" - "vdivps %%ymm4, %%ymm0, %%ymm0 \n\t" - "vmovups %%ymm0, (%1) \n\t" - : - : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride), "r"(&poolSize) - : "%eax", "%rax", "%ecx", "%rdi", "%ymm0", "%ymm4", "memory", "cc"); -} - -EE pooling_fp32(TensorDesc 
inputDesc, - const F32 *input, - PoolingParamSpec poolingParamSpec, - TensorDesc outputDesc, - F32 *output) +EE pooling_fp32( + TensorDesc inputDesc, const F32 *input, PoolingParamSpec p, TensorDesc outputDesc, F32 *output) { if (nullptr == input || nullptr == output) { CHECK_STATUS(NULL_POINTER); @@ -288,15 +42,15 @@ EE pooling_fp32(TensorDesc inputDesc, CHECK_STATUS(NOT_MATCH); } - PoolingMode pm = poolingParamSpec.mode; - U32 strideH = poolingParamSpec.stride_h; - U32 strideW = poolingParamSpec.stride_w; - U32 paddingT = poolingParamSpec.padding_top; - U32 paddingL = poolingParamSpec.padding_left; - U32 kernelSizeH = poolingParamSpec.kernel_h; - U32 kernelSizeW = poolingParamSpec.kernel_w; + PoolingMode pm = p.mode; + U32 strideH = p.stride_h; + U32 strideW = p.stride_w; + U32 paddingT = p.pad_top; + U32 paddingL = p.pad_left; + U32 kernelSizeH = p.kernel_h; + U32 kernelSizeW = p.kernel_w; U32 wSize, kh, kw, iStep; - F32 poolSize, *curO; + F32 *curO; const F32 *curI; if (paddingT >= kernelSizeH || paddingL >= kernelSizeW) { CHECK_STATUS(NOT_SUPPORTED); @@ -307,6 +61,7 @@ EE pooling_fp32(TensorDesc inputDesc, U32 wSizes[3] = {1, 2, 4}; pooling_max_func pooling_max[3] = {pooling_max_w1, pooling_max_w2, pooling_max_w4}; pooling_mean_func pooling_mean[3] = {pooling_mean_w1, pooling_mean_w2, pooling_mean_w4}; + F32 poolSize = kernelSizeH * kernelSizeW; for (U32 n = 0; n < in; n++) { for (U32 c = 0; c < ic; c++) { for (U32 h = 0; h < oh; h++) { @@ -329,7 +84,9 @@ EE pooling_fp32(TensorDesc inputDesc, kh = hend - hstart; kw = wend - wstart; iStep = (iw - kw) * 32; - poolSize = kw * kh * 1.0f; + if (!p.count_include_pad) { + poolSize = kh * kw; + } if (kw < kernelSizeW) { wSize = 1; } @@ -344,7 +101,7 @@ EE pooling_fp32(TensorDesc inputDesc, break; } default: - CHECK_STATUS(NOT_SUPPORTED); + return NOT_SUPPORTED; } } } diff --git a/compute/tensor/src/cpu/x86/fp32/pooling_avx512.cpp b/compute/tensor/src/cpu/x86/fp32/pooling_avx512.cpp index 538e6b71..78b79ffe 100644 --- a/compute/tensor/src/cpu/x86/fp32/pooling_avx512.cpp +++ b/compute/tensor/src/cpu/x86/fp32/pooling_avx512.cpp @@ -12,6 +12,7 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
#include "cpu/x86/fp32/tensor_computing_fp32.h" +#include "cpu/x86/fp32/pooling_kernel.h" #define UNROLL_W 4 @@ -19,258 +20,8 @@ typedef void (*pooling_max_func)(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 typedef void (*pooling_mean_func)( const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride, F32 poolSize); -void pooling_c16_max_w4(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride) -{ - __asm__ __volatile__("mov %%eax, %%eax \n\t" - "mov %4, %%eax \n\t" - "mov %%rax, %%rdi \n\t" - "mov %%eax, %%eax \n\t" - "mov %5, %%eax \n\t" - "mov %%rax, %%r9 \n\t" - "add %%r9, %%r9 \n\t" - "mov %%rax, %%r10 \n\t" - "add %%r9, %%r10 \n\t" - "add %0, %%rax \n\t" - "add %0, %%r9 \n\t" - "add %0, %%r10 \n\t" - - "vmovups (%0), %%zmm0 \n\t" - "vmovups (%%rax), %%zmm1 \n\t" - "vmovups (%%r9), %%zmm2 \n\t" - "vmovups (%%r10), %%zmm3 \n\t" - - ".align 16 \n\t" - "0: \n\t" - - "mov %2, %%ecx \n\t" - ".align 16 \n\t" - "1: \n\t" - - "vmovups (%0), %%zmm4 \n\t" - "vmovups (%%rax), %%zmm5 \n\t" - "vmovups (%%r9), %%zmm6 \n\t" - "vmovups (%%r10), %%zmm7 \n\t" - - "vmaxps %%zmm0, %%zmm4, %%zmm0 \n\t" - "vmaxps %%zmm1, %%zmm5, %%zmm1 \n\t" - "vmaxps %%zmm2, %%zmm6, %%zmm2 \n\t" - "vmaxps %%zmm3, %%zmm7, %%zmm3 \n\t" - - "add $0x40, %0 \n\t" - "add $0x40, %%rax \n\t" - "add $0x40, %%r9 \n\t" - "add $0x40, %%r10 \n\t" - "dec %%ecx \n\t" - "jg 1b \n\t" - - "add %%rdi, %0 \n\t" - "add %%rdi, %%rax \n\t" - "add %%rdi, %%r9 \n\t" - "add %%rdi, %%r10 \n\t" - "dec %%ebx \n\t" - "jg 0b \n\t" - - "vmovups %%zmm0, (%1) \n\t" - "vmovups %%zmm1, 0x40(%1) \n\t" - "vmovups %%zmm2, 0x80(%1) \n\t" - "vmovups %%zmm3, 0xC0(%1) \n\t" - : - : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride) - : "%eax", "%rax", "%ecx", "%r10", "%r9", "%rdi", "%zmm0", "%zmm1", "%zmm2", - "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "memory", "cc"); -} - -void pooling_c16_max_w2(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride) -{ - __asm__ __volatile__( - "mov %%eax, %%eax \n\t" - "mov %4, %%eax \n\t" - "mov %%rax, %%rdi \n\t" - "mov %%eax, %%eax \n\t" - "mov %5, %%eax \n\t" - "add %0, %%rax \n\t" - "vmovups (%0), %%zmm0 \n\t" - "vmovups (%%rax), %%zmm1 \n\t" - ".align 16 \n\t" - "0: \n\t" - "mov %2, %%ecx \n\t" - ".align 16 \n\t" - "1: \n\t" - "vmovups (%0), %%zmm4 \n\t" - "vmovups (%%rax), %%zmm5 \n\t" - "vmaxps %%zmm0, %%zmm4, %%zmm0 \n\t" - "vmaxps %%zmm1, %%zmm5, %%zmm1 \n\t" - "add $0x40, %0 \n\t" - "add $0x40, %%rax \n\t" - "dec %%ecx \n\t" - "jg 1b \n\t" - "add %%rdi, %0 \n\t" - "add %%rdi, %%rax \n\t" - "dec %%ebx \n\t" - "jg 0b \n\t" - "vmovups %%zmm0, (%1) \n\t" - "vmovups %%zmm1, 0x40(%1) \n\t" - : - : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride) - : "%eax", "%rax", "%ecx", "%rdi", "%zmm0", "%zmm1", "%zmm4", "%zmm5", "memory", "cc"); -} - -void pooling_c16_max_w1(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride) -{ - __asm__ __volatile__("mov %%eax, %%eax \n\t" - "mov %4, %%eax \n\t" - "mov %%rax, %%rdi \n\t" - "vmovups (%0), %%zmm0 \n\t" - ".align 16 \n\t" - "0: \n\t" - "mov %2, %%ecx \n\t" - ".align 16 \n\t" - "1: \n\t" - "vmovups (%0), %%zmm4 \n\t" - "vmaxps %%zmm0, %%zmm4, %%zmm0 \n\t" - "add $0x40, %0 \n\t" - "dec %%ecx \n\t" - "jg 1b \n\t" - "add %%rdi, %0 \n\t" - "dec %%ebx \n\t" - "jg 0b \n\t" - "vmovups %%zmm0, (%1) \n\t" - : - : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride) - : "%eax", "%rax", "%ecx", "%rdi", "%zmm0", "%zmm4", "memory", "cc"); -} - -void pooling_c16_mean_w4( - const F32 *curI, F32 *curO, U32 kw, U32 
kh, U32 iStep, U32 stride, F32 poolSize) -{ - __asm__ __volatile__( - "mov %%eax, %%eax \n\t" - "mov %4, %%eax \n\t" - "mov %%rax, %%rdi \n\t" - "mov %5, %%eax \n\t" - "mov %%rax, %%r9 \n\t" - "add %%r9, %%r9 \n\t" - "mov %%rax, %%r10 \n\t" - "add %%r9, %%r10 \n\t" - "add %0, %%rax \n\t" - "add %0, %%r9 \n\t" - "add %0, %%r10 \n\t" - "vxorps %%zmm0, %%zmm0, %%zmm0 \n\t" - "vxorps %%zmm1, %%zmm1, %%zmm1 \n\t" - "vxorps %%zmm2, %%zmm2, %%zmm2 \n\t" - "vxorps %%zmm3, %%zmm3, %%zmm3 \n\t" - ".align 16 \n\t" - "0: \n\t" - "mov %2, %%ecx \n\t" - ".align 16 \n\t" - "1: \n\t" - "vmovups (%0), %%zmm4 \n\t" - "vmovups (%%rax), %%zmm5 \n\t" - "vmovups (%%r9), %%zmm6 \n\t" - "vmovups (%%r10), %%zmm7 \n\t" - "vaddps %%zmm0, %%zmm4, %%zmm0 \n\t" - "vaddps %%zmm1, %%zmm5, %%zmm1 \n\t" - "vaddps %%zmm2, %%zmm6, %%zmm2 \n\t" - "vaddps %%zmm3, %%zmm7, %%zmm3 \n\t" - "add $0x40, %0 \n\t" - "add $0x40, %%rax \n\t" - "add $0x40, %%r9 \n\t" - "add $0x40, %%r10 \n\t" - "dec %%ecx \n\t" - "jg 1b \n\t" - "add %%rdi, %0 \n\t" - "add %%rdi, %%rax \n\t" - "add %%rdi, %%r9 \n\t" - "add %%rdi, %%r10 \n\t" - "dec %%ebx \n\t" - "jg 0b \n\t" - "vbroadcastss (%6), %%zmm4 \n\t" - "vdivps %%zmm4, %%zmm0, %%zmm0 \n\t" - "vdivps %%zmm4, %%zmm1, %%zmm1 \n\t" - "vdivps %%zmm4, %%zmm2, %%zmm2 \n\t" - "vdivps %%zmm4, %%zmm3, %%zmm3 \n\t" - "vmovups %%zmm0, (%1) \n\t" - "vmovups %%zmm1, 0x40(%1) \n\t" - "vmovups %%zmm2, 0x80(%1) \n\t" - "vmovups %%zmm3, 0xC0(%1) \n\t" - : - : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride), "r"(&poolSize) - : "%eax", "%rax", "%ecx", "%r10", "%r9", "%rdi", "%zmm0", "%zmm1", "%zmm2", "%zmm3", - "%zmm4", "%zmm5", "%zmm6", "%zmm7", "memory", "cc"); -} - -void pooling_c16_mean_w2( - const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride, F32 poolSize) -{ - __asm__ __volatile__( - "mov %%eax, %%eax \n\t" - "mov %4, %%eax \n\t" - "mov %%rax, %%rdi \n\t" - "mov %5, %%eax \n\t" - "add %0, %%rax \n\t" - "vxorps %%zmm0, %%zmm0, %%zmm0 \n\t" - "vxorps %%zmm1, %%zmm1, %%zmm1 \n\t" - ".align 16 \n\t" - "0: \n\t" - "mov %2, %%ecx \n\t" - ".align 16 \n\t" - "1: \n\t" - "vmovups (%0), %%zmm4 \n\t" - "vmovups (%%rax), %%zmm5 \n\t" - "vaddps %%zmm0, %%zmm4, %%zmm0 \n\t" - "vaddps %%zmm1, %%zmm5, %%zmm1 \n\t" - "add $0x40, %0 \n\t" - "add $0x40, %%rax \n\t" - "dec %%ecx \n\t" - "jg 1b \n\t" - "add %%rdi, %0 \n\t" - "add %%rdi, %%rax \n\t" - "dec %%ebx \n\t" - "jg 0b \n\t" - "vbroadcastss (%6), %%zmm4 \n\t" - "vdivps %%zmm4, %%zmm0, %%zmm0 \n\t" - "vdivps %%zmm4, %%zmm1, %%zmm1 \n\t" - "vmovups %%zmm0, (%1) \n\t" - "vmovups %%zmm1, 0x40(%1) \n\t" - : - : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride), "r"(&poolSize) - : "%eax", "%rax", "%ecx", "%rdi", "%zmm0", "%zmm1", "%zmm4", "%zmm5", "memory", "cc"); -} - -void pooling_c16_mean_w1( - const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride, F32 poolSize) -{ - __asm__ __volatile__( - "mov %%eax, %%eax \n\t" - "mov %4, %%eax \n\t" - "mov %%rax, %%rdi \n\t" - "vxorps %%zmm0, %%zmm0, %%zmm0 \n\t" - ".align 16 \n\t" - "0: \n\t" - "mov %2, %%ecx \n\t" - ".align 16 \n\t" - "1: \n\t" - "vmovups (%0), %%zmm4 \n\t" - "vaddps %%zmm0, %%zmm4, %%zmm0 \n\t" - "add $0x40, %0 \n\t" - "dec %%ecx \n\t" - "jg 1b \n\t" - "add %%rdi, %0 \n\t" - "dec %%ebx \n\t" - "jg 0b \n\t" - "vbroadcastss (%6), %%zmm4 \n\t" - "vdivps %%zmm4, %%zmm0, %%zmm0 \n\t" - "vmovups %%zmm0, (%1) \n\t" - : - : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride), "r"(&poolSize) - : "%eax", "%rax", "%ecx", "%rdi", "%zmm0", "%zmm4", "memory", "cc"); 
-} - -EE pooling_c16_fp32(TensorDesc inputDesc, - const F32 *input, - PoolingParamSpec poolingParamSpec, - TensorDesc outputDesc, - F32 *output) +EE pooling_c16_fp32( + TensorDesc inputDesc, const F32 *input, PoolingParamSpec p, TensorDesc outputDesc, F32 *output) { if (nullptr == input || nullptr == output) { CHECK_STATUS(NULL_POINTER); @@ -291,31 +42,33 @@ EE pooling_c16_fp32(TensorDesc inputDesc, CHECK_STATUS(NOT_MATCH); } - PoolingMode pm = poolingParamSpec.mode; - U32 strideH = poolingParamSpec.stride_h; - U32 strideW = poolingParamSpec.stride_w; - U32 paddingT = poolingParamSpec.padding_top; - U32 paddingL = poolingParamSpec.padding_left; - U32 kernelSizeH = poolingParamSpec.kernel_h; - U32 kernelSizeW = poolingParamSpec.kernel_w; + PoolingMode pm = p.mode; + U32 strideH = p.stride_h; + U32 strideW = p.stride_w; + U32 paddingT = p.pad_top; + U32 paddingL = p.pad_left; + U32 kernelSizeH = p.kernel_h; + U32 kernelSizeW = p.kernel_w; U32 wSize, kh, kw, iStep; - F32 poolSize, *curO; + F32 *curO; const F32 *curI; if (paddingT >= kernelSizeH || paddingL >= kernelSizeW) { CHECK_STATUS(NOT_SUPPORTED); } - if (ic % 16 != 0) { - CHECK_STATUS(NOT_MATCH); - } - ic /= 16; U32 owInter = (iw + paddingL - kernelSizeW) / strideW + 1; U32 wSizes[3] = {1, 2, 4}; - pooling_max_func pooling_max[3] = {pooling_c16_max_w1, pooling_c16_max_w2, pooling_c16_max_w4}; - pooling_mean_func pooling_mean[3] = { - pooling_c16_mean_w1, pooling_c16_mean_w2, pooling_c16_mean_w4}; + pooling_max_func pooling_max[2][3] = {{pooling_max_w1, pooling_max_w2, pooling_max_w4}, + {pooling_c16_max_w1, pooling_c16_max_w2, pooling_c16_max_w4}}; + pooling_mean_func pooling_mean[2][3] = {{pooling_mean_w1, pooling_mean_w2, pooling_mean_w4}, + {pooling_c16_mean_w1, pooling_c16_mean_w2, pooling_c16_mean_w4}}; + F32 poolSize = kernelSizeH * kernelSizeW; for (U32 n = 0; n < in; n++) { - for (U32 c = 0; c < ic; c++) { + for (U32 c = 0; c < ic; c += 16) { + U32 cx = 16; + if (c + 16 > ic) { + cx = 8; + } for (U32 h = 0; h < oh; h++) { for (U32 w = 0; w < ow; w += wSize) { if (w < owInter) { @@ -331,23 +84,26 @@ EE pooling_c16_fp32(TensorDesc inputDesc, hstart = UNI_MAX(hstart, 0); wstart = UNI_MAX(wstart, 0); - curI = input + (hstart * iw + wstart) * 16; - curO = output + (h * ow + w) * 16; + curI = input + (hstart * iw + wstart) * cx; + curO = output + (h * ow + w) * cx; kh = hend - hstart; kw = wend - wstart; - iStep = (iw - kw) * 64; - poolSize = kw * kh * 1.0f; + iStep = (iw - kw) * cx * 4; + if (!p.count_include_pad) { + poolSize = kh * kw; + } if (kw < kernelSizeW) { wSize = 1; } switch (pm) { case POOLING_MAX: { - pooling_max[wSize >> 1](curI, curO, kw, kh, iStep, strideW * 64); + pooling_max[cx >> 4][wSize >> 1]( + curI, curO, kw, kh, iStep, strideW * cx * 4); break; } case POOLING_MEAN: { - pooling_mean[wSize >> 1]( - curI, curO, kw, kh, iStep, strideW * 64, poolSize); + pooling_mean[cx >> 4][wSize >> 1]( + curI, curO, kw, kh, iStep, strideW * cx * 4, poolSize); break; } default: @@ -355,8 +111,8 @@ EE pooling_c16_fp32(TensorDesc inputDesc, } } } - input += ih * iw * 16; - output += oh * ow * 16; + input += ih * iw * cx; + output += oh * ow * cx; } } return SUCCESS; diff --git a/compute/tensor/src/cpu/x86/fp32/pooling_bp.cpp b/compute/tensor/src/cpu/x86/fp32/pooling_bp.cpp index c866f86e..4ae13a19 100644 --- a/compute/tensor/src/cpu/x86/fp32/pooling_bp.cpp +++ b/compute/tensor/src/cpu/x86/fp32/pooling_bp.cpp @@ -15,13 +15,27 @@ #define UNROLL_W 4 -typedef void (*pooling_bp_func)( - const F32 *input, int hstart, int hend, int wstart, 
int wend, F32 *output, U32 ow, U32 strideW); +typedef void (*pooling_bp_func)(const F32 *input, + int hstart, + int hend, + int wstart, + int wend, + int pool, + F32 *output, + U32 ow, + U32 strideW); -void pooling_bp_c8_w4_fp32( - const F32 *input, int hstart, int hend, int wstart, int wend, F32 *output, U32 ow, U32 strideW) +void pooling_bp_c8_w4_fp32(const F32 *input, + int hstart, + int hend, + int wstart, + int wend, + int pool, + F32 *output, + U32 ow, + U32 strideW) { - __m256 poolSize = _mm256_set1_ps((hend - hstart) * (wend - wstart) * 1.0f); + __m256 poolSize = _mm256_set1_ps(pool); __m256 in0 = _mm256_div_ps(_mm256_loadu_ps(input), poolSize); __m256 in1 = _mm256_div_ps(_mm256_loadu_ps(input + 8), poolSize); __m256 in2 = _mm256_div_ps(_mm256_loadu_ps(input + 16), poolSize); @@ -44,10 +58,17 @@ void pooling_bp_c8_w4_fp32( } } -void pooling_bp_c8_w2_fp32( - const F32 *input, int hstart, int hend, int wstart, int wend, F32 *output, U32 ow, U32 strideW) +void pooling_bp_c8_w2_fp32(const F32 *input, + int hstart, + int hend, + int wstart, + int wend, + int pool, + F32 *output, + U32 ow, + U32 strideW) { - __m256 poolSize = _mm256_set1_ps((hend - hstart) * (wend - wstart) * 1.0f); + __m256 poolSize = _mm256_set1_ps(pool); __m256 in0 = _mm256_div_ps(_mm256_loadu_ps(input), poolSize); __m256 in1 = _mm256_div_ps(_mm256_loadu_ps(input + 8), poolSize); for (int kernelH = hstart; kernelH < hend; kernelH++) { @@ -62,10 +83,17 @@ void pooling_bp_c8_w2_fp32( } } -void pooling_bp_c8_w1_fp32( - const F32 *input, int hstart, int hend, int wstart, int wend, F32 *output, U32 ow, U32 strideW) +void pooling_bp_c8_w1_fp32(const F32 *input, + int hstart, + int hend, + int wstart, + int wend, + int pool, + F32 *output, + U32 ow, + U32 strideW) { - __m256 poolSize = _mm256_set1_ps((hend - hstart) * (wend - wstart) * 1.0f); + __m256 poolSize = _mm256_set1_ps(pool); __m256 in0 = _mm256_div_ps(_mm256_loadu_ps(input), poolSize); for (int kernelH = hstart; kernelH < hend; kernelH++) { for (int kernelW = wstart; kernelW < wend; kernelW++) { @@ -98,7 +126,7 @@ EE pooling_bp_fp32( if (idf != DF_NCHWC8 || odf != idf) { ret = NOT_MATCH; } - if (p.padding_top >= p.kernel_h || p.padding_left >= p.kernel_w) { + if (p.pad_top >= p.kernel_h || p.pad_left >= p.kernel_w) { ret = NOT_SUPPORTED; } PoolingMode pm = p.mode; @@ -108,11 +136,12 @@ EE pooling_bp_fp32( ic /= 8; U32 wSize = 0; - U32 iwInter = (ow + p.padding_left - p.kernel_w) / p.stride_w + 1; + U32 iwInter = (ow + p.pad_left - p.kernel_w) / p.stride_w + 1; const F32 *curI = input; F32 *curO = output; pooling_bp_func pooling_bp[3] = { pooling_bp_c8_w1_fp32, pooling_bp_c8_w2_fp32, pooling_bp_c8_w4_fp32}; + int poolSize = p.kernel_t * p.kernel_h * p.kernel_w; for (U32 n = 0; n < in; n++) { for (U32 c = 0; c < ic; c++) { for (U32 h = 0; h < ih; h++) { @@ -122,8 +151,8 @@ EE pooling_bp_fp32( } else { wSize = 1; } - int hstart = (int)h * (int)p.stride_h - (int)p.padding_top; - int wstart = (int)w * (int)p.stride_w - (int)p.padding_left; + int hstart = (int)h * (int)p.stride_h - (int)p.pad_top; + int wstart = (int)w * (int)p.stride_w - (int)p.pad_left; int hend = UNI_MIN(hstart + p.kernel_h, oh); int wend = UNI_MIN(wstart + p.kernel_w, ow); hstart = UNI_MAX(hstart, 0); @@ -131,7 +160,11 @@ EE pooling_bp_fp32( if (wend < wstart + (int)p.kernel_w) { wSize = 1; } - pooling_bp[wSize >> 1](curI, hstart, hend, wstart, wend, curO, ow, p.stride_w); + if (!p.count_include_pad) { + poolSize = (hend - hstart) * (wend - wstart); + } + pooling_bp[wSize >> 1]( + curI, hstart, hend, 
wstart, wend, poolSize, curO, ow, p.stride_w); curI += wSize * 8; } } @@ -139,4 +172,4 @@ EE pooling_bp_fp32( } } return ret; -} \ No newline at end of file +} diff --git a/compute/tensor/src/cpu/x86/fp32/pooling_kernel.h b/compute/tensor/src/cpu/x86/fp32/pooling_kernel.h new file mode 100644 index 00000000..7f838dca --- /dev/null +++ b/compute/tensor/src/cpu/x86/fp32/pooling_kernel.h @@ -0,0 +1,508 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_POOLING_KERNEL +#define _H_POOLING_KERNEL + +inline void pooling_max_w4(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride) +{ + __asm__ __volatile__("mov %%eax, %%eax \n\t" + "mov %4, %%eax \n\t" + "mov %%rax, %%rdi \n\t" + "mov %%eax, %%eax \n\t" + "mov %5, %%eax \n\t" + "mov %%rax, %%r9 \n\t" + "add %%r9, %%r9 \n\t" + "mov %%rax, %%r10 \n\t" + "add %%r9, %%r10 \n\t" + "add %0, %%rax \n\t" + "add %0, %%r9 \n\t" + "add %0, %%r10 \n\t" + + "vmovups (%0), %%ymm0 \n\t" + "vmovups (%%rax), %%ymm1 \n\t" + "vmovups (%%r9), %%ymm2 \n\t" + "vmovups (%%r10), %%ymm3 \n\t" + + ".align 16 \n\t" + "0: \n\t" + + "mov %2, %%ecx \n\t" + ".align 16 \n\t" + "1: \n\t" + + "vmovups (%0), %%ymm4 \n\t" + "vmovups (%%rax), %%ymm5 \n\t" + "vmovups (%%r9), %%ymm6 \n\t" + "vmovups (%%r10), %%ymm7 \n\t" + + "vmaxps %%ymm0, %%ymm4, %%ymm0 \n\t" + "vmaxps %%ymm1, %%ymm5, %%ymm1 \n\t" + "vmaxps %%ymm2, %%ymm6, %%ymm2 \n\t" + "vmaxps %%ymm3, %%ymm7, %%ymm3 \n\t" + + "add $0x20, %0 \n\t" + "add $0x20, %%rax \n\t" + "add $0x20, %%r9 \n\t" + "add $0x20, %%r10 \n\t" + "dec %%ecx \n\t" + "jg 1b \n\t" + + "add %%rdi, %0 \n\t" + "add %%rdi, %%rax \n\t" + "add %%rdi, %%r9 \n\t" + "add %%rdi, %%r10 \n\t" + "dec %%ebx \n\t" + "jg 0b \n\t" + + "vmovups %%ymm0, (%1) \n\t" + "vmovups %%ymm1, 0x20(%1) \n\t" + "vmovups %%ymm2, 0x40(%1) \n\t" + "vmovups %%ymm3, 0x60(%1) \n\t" + : + : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride) + : "%eax", "%rax", "%ecx", "%r10", "%r9", "%rdi", "%ymm0", "%ymm1", "%ymm2", + "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "memory", "cc"); +} + +inline void pooling_max_w2(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride) +{ + __asm__ __volatile__( + "mov %%eax, %%eax \n\t" + "mov %4, %%eax \n\t" + "mov %%rax, %%rdi \n\t" + "mov %%eax, %%eax \n\t" + "mov %5, %%eax \n\t" + "add %0, %%rax \n\t" + "vmovups (%0), %%ymm0 \n\t" + "vmovups (%%rax), %%ymm1 \n\t" + ".align 16 \n\t" + "0: \n\t" + "mov %2, %%ecx \n\t" + ".align 16 \n\t" + "1: 
\n\t" + "vmovups (%0), %%ymm4 \n\t" + "vmovups (%%rax), %%ymm5 \n\t" + "vmaxps %%ymm0, %%ymm4, %%ymm0 \n\t" + "vmaxps %%ymm1, %%ymm5, %%ymm1 \n\t" + "add $0x20, %0 \n\t" + "add $0x20, %%rax \n\t" + "dec %%ecx \n\t" + "jg 1b \n\t" + "add %%rdi, %0 \n\t" + "add %%rdi, %%rax \n\t" + "dec %%ebx \n\t" + "jg 0b \n\t" + "vmovups %%ymm0, (%1) \n\t" + "vmovups %%ymm1, 0x20(%1) \n\t" + : + : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride) + : "%eax", "%rax", "%ecx", "%rdi", "%ymm0", "%ymm1", "%ymm4", "%ymm5", "memory", "cc"); +} + +inline void pooling_max_w1(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride) +{ + __asm__ __volatile__("mov %%eax, %%eax \n\t" + "mov %4, %%eax \n\t" + "mov %%rax, %%rdi \n\t" + "vmovups (%0), %%ymm0 \n\t" + ".align 16 \n\t" + "0: \n\t" + "mov %2, %%ecx \n\t" + ".align 16 \n\t" + "1: \n\t" + "vmovups (%0), %%ymm4 \n\t" + "vmaxps %%ymm0, %%ymm4, %%ymm0 \n\t" + "add $0x20, %0 \n\t" + "dec %%ecx \n\t" + "jg 1b \n\t" + "add %%rdi, %0 \n\t" + "dec %%ebx \n\t" + "jg 0b \n\t" + "vmovups %%ymm0, (%1) \n\t" + : + : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride) + : "%eax", "%rax", "%ecx", "%rdi", "%ymm0", "%ymm4", "memory", "cc"); +} + +inline void pooling_mean_w4(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride, F32 poolSize) +{ + __asm__ __volatile__( + "mov %%eax, %%eax \n\t" + "mov %4, %%eax \n\t" + "mov %%rax, %%rdi \n\t" + "mov %5, %%eax \n\t" + "mov %%rax, %%r9 \n\t" + "add %%r9, %%r9 \n\t" + "mov %%rax, %%r10 \n\t" + "add %%r9, %%r10 \n\t" + "add %0, %%rax \n\t" + "add %0, %%r9 \n\t" + "add %0, %%r10 \n\t" + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" + "vxorps %%ymm2, %%ymm2, %%ymm2 \n\t" + "vxorps %%ymm3, %%ymm3, %%ymm3 \n\t" + ".align 16 \n\t" + "0: \n\t" + "mov %2, %%ecx \n\t" + ".align 16 \n\t" + "1: \n\t" + "vmovups (%0), %%ymm4 \n\t" + "vmovups (%%rax), %%ymm5 \n\t" + "vmovups (%%r9), %%ymm6 \n\t" + "vmovups (%%r10), %%ymm7 \n\t" + "vaddps %%ymm0, %%ymm4, %%ymm0 \n\t" + "vaddps %%ymm1, %%ymm5, %%ymm1 \n\t" + "vaddps %%ymm2, %%ymm6, %%ymm2 \n\t" + "vaddps %%ymm3, %%ymm7, %%ymm3 \n\t" + "add $0x20, %0 \n\t" + "add $0x20, %%rax \n\t" + "add $0x20, %%r9 \n\t" + "add $0x20, %%r10 \n\t" + "dec %%ecx \n\t" + "jg 1b \n\t" + "add %%rdi, %0 \n\t" + "add %%rdi, %%rax \n\t" + "add %%rdi, %%r9 \n\t" + "add %%rdi, %%r10 \n\t" + "dec %%ebx \n\t" + "jg 0b \n\t" + "vbroadcastss (%6), %%ymm4 \n\t" + "vdivps %%ymm4, %%ymm0, %%ymm0 \n\t" + "vdivps %%ymm4, %%ymm1, %%ymm1 \n\t" + "vdivps %%ymm4, %%ymm2, %%ymm2 \n\t" + "vdivps %%ymm4, %%ymm3, %%ymm3 \n\t" + "vmovups %%ymm0, (%1) \n\t" + "vmovups %%ymm1, 0x20(%1) \n\t" + "vmovups %%ymm2, 0x40(%1) \n\t" + "vmovups %%ymm3, 0x60(%1) \n\t" + : + : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride), "r"(&poolSize) + : "%eax", "%rax", "%ecx", "%r10", "%r9", "%rdi", "%ymm0", "%ymm1", "%ymm2", "%ymm3", + "%ymm4", "%ymm5", "%ymm6", "%ymm7", "memory", "cc"); +} + +inline void pooling_mean_w2(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride, F32 poolSize) +{ + __asm__ __volatile__( + "mov %%eax, %%eax \n\t" + "mov %4, %%eax \n\t" + "mov %%rax, %%rdi \n\t" + "mov %5, %%eax \n\t" + "add %0, %%rax \n\t" + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" + ".align 16 \n\t" + "0: \n\t" + "mov %2, %%ecx \n\t" + ".align 16 \n\t" + "1: \n\t" + "vmovups (%0), %%ymm4 \n\t" + "vmovups (%%rax), %%ymm5 \n\t" + "vaddps %%ymm0, %%ymm4, %%ymm0 \n\t" + "vaddps %%ymm1, %%ymm5, %%ymm1 \n\t" + "add $0x20, %0 \n\t" + 
"add $0x20, %%rax \n\t" + "dec %%ecx \n\t" + "jg 1b \n\t" + "add %%rdi, %0 \n\t" + "add %%rdi, %%rax \n\t" + "dec %%ebx \n\t" + "jg 0b \n\t" + "vbroadcastss (%6), %%ymm4 \n\t" + "vdivps %%ymm4, %%ymm0, %%ymm0 \n\t" + "vdivps %%ymm4, %%ymm1, %%ymm1 \n\t" + "vmovups %%ymm0, (%1) \n\t" + "vmovups %%ymm1, 0x20(%1) \n\t" + : + : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride), "r"(&poolSize) + : "%eax", "%rax", "%ecx", "%rdi", "%ymm0", "%ymm1", "%ymm4", "%ymm5", "memory", "cc"); +} + +inline void pooling_mean_w1(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride, F32 poolSize) +{ + __asm__ __volatile__( + "mov %%eax, %%eax \n\t" + "mov %4, %%eax \n\t" + "mov %%rax, %%rdi \n\t" + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + ".align 16 \n\t" + "0: \n\t" + "mov %2, %%ecx \n\t" + ".align 16 \n\t" + "1: \n\t" + "vmovups (%0), %%ymm4 \n\t" + "vaddps %%ymm0, %%ymm4, %%ymm0 \n\t" + "add $0x20, %0 \n\t" + "dec %%ecx \n\t" + "jg 1b \n\t" + "add %%rdi, %0 \n\t" + "dec %%ebx \n\t" + "jg 0b \n\t" + "vbroadcastss (%6), %%ymm4 \n\t" + "vdivps %%ymm4, %%ymm0, %%ymm0 \n\t" + "vmovups %%ymm0, (%1) \n\t" + : + : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride), "r"(&poolSize) + : "%eax", "%rax", "%ecx", "%rdi", "%ymm0", "%ymm4", "memory", "cc"); +} + +inline void pooling_c16_max_w4(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride) +{ + __asm__ __volatile__("mov %%eax, %%eax \n\t" + "mov %4, %%eax \n\t" + "mov %%rax, %%rdi \n\t" + "mov %%eax, %%eax \n\t" + "mov %5, %%eax \n\t" + "mov %%rax, %%r9 \n\t" + "add %%r9, %%r9 \n\t" + "mov %%rax, %%r10 \n\t" + "add %%r9, %%r10 \n\t" + "add %0, %%rax \n\t" + "add %0, %%r9 \n\t" + "add %0, %%r10 \n\t" + + "vmovups (%0), %%zmm0 \n\t" + "vmovups (%%rax), %%zmm1 \n\t" + "vmovups (%%r9), %%zmm2 \n\t" + "vmovups (%%r10), %%zmm3 \n\t" + + ".align 16 \n\t" + "0: \n\t" + + "mov %2, %%ecx \n\t" + ".align 16 \n\t" + "1: \n\t" + + "vmovups (%0), %%zmm4 \n\t" + "vmovups (%%rax), %%zmm5 \n\t" + "vmovups (%%r9), %%zmm6 \n\t" + "vmovups (%%r10), %%zmm7 \n\t" + + "vmaxps %%zmm0, %%zmm4, %%zmm0 \n\t" + "vmaxps %%zmm1, %%zmm5, %%zmm1 \n\t" + "vmaxps %%zmm2, %%zmm6, %%zmm2 \n\t" + "vmaxps %%zmm3, %%zmm7, %%zmm3 \n\t" + + "add $0x40, %0 \n\t" + "add $0x40, %%rax \n\t" + "add $0x40, %%r9 \n\t" + "add $0x40, %%r10 \n\t" + "dec %%ecx \n\t" + "jg 1b \n\t" + + "add %%rdi, %0 \n\t" + "add %%rdi, %%rax \n\t" + "add %%rdi, %%r9 \n\t" + "add %%rdi, %%r10 \n\t" + "dec %%ebx \n\t" + "jg 0b \n\t" + + "vmovups %%zmm0, (%1) \n\t" + "vmovups %%zmm1, 0x40(%1) \n\t" + "vmovups %%zmm2, 0x80(%1) \n\t" + "vmovups %%zmm3, 0xC0(%1) \n\t" + : + : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride) + : "%eax", "%rax", "%ecx", "%r10", "%r9", "%rdi", "%zmm0", "%zmm1", "%zmm2", + "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "memory", "cc"); +} + +inline void pooling_c16_max_w2(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride) +{ + __asm__ __volatile__( + "mov %%eax, %%eax \n\t" + "mov %4, %%eax \n\t" + "mov %%rax, %%rdi \n\t" + "mov %%eax, %%eax \n\t" + "mov %5, %%eax \n\t" + "add %0, %%rax \n\t" + "vmovups (%0), %%zmm0 \n\t" + "vmovups (%%rax), %%zmm1 \n\t" + ".align 16 \n\t" + "0: \n\t" + "mov %2, %%ecx \n\t" + ".align 16 \n\t" + "1: \n\t" + "vmovups (%0), %%zmm4 \n\t" + "vmovups (%%rax), %%zmm5 \n\t" + "vmaxps %%zmm0, %%zmm4, %%zmm0 \n\t" + "vmaxps %%zmm1, %%zmm5, %%zmm1 \n\t" + "add $0x40, %0 \n\t" + "add $0x40, %%rax \n\t" + "dec %%ecx \n\t" + "jg 1b \n\t" + "add %%rdi, %0 \n\t" + "add %%rdi, %%rax \n\t" + "dec %%ebx \n\t" + "jg 
0b \n\t" + "vmovups %%zmm0, (%1) \n\t" + "vmovups %%zmm1, 0x40(%1) \n\t" + : + : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride) + : "%eax", "%rax", "%ecx", "%rdi", "%zmm0", "%zmm1", "%zmm4", "%zmm5", "memory", "cc"); +} + +inline void pooling_c16_max_w1(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride) +{ + __asm__ __volatile__("mov %%eax, %%eax \n\t" + "mov %4, %%eax \n\t" + "mov %%rax, %%rdi \n\t" + "vmovups (%0), %%zmm0 \n\t" + ".align 16 \n\t" + "0: \n\t" + "mov %2, %%ecx \n\t" + ".align 16 \n\t" + "1: \n\t" + "vmovups (%0), %%zmm4 \n\t" + "vmaxps %%zmm0, %%zmm4, %%zmm0 \n\t" + "add $0x40, %0 \n\t" + "dec %%ecx \n\t" + "jg 1b \n\t" + "add %%rdi, %0 \n\t" + "dec %%ebx \n\t" + "jg 0b \n\t" + "vmovups %%zmm0, (%1) \n\t" + : + : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride) + : "%eax", "%rax", "%ecx", "%rdi", "%zmm0", "%zmm4", "memory", "cc"); +} + +inline void pooling_c16_mean_w4( + const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride, F32 poolSize) +{ + __asm__ __volatile__( + "mov %%eax, %%eax \n\t" + "mov %4, %%eax \n\t" + "mov %%rax, %%rdi \n\t" + "mov %5, %%eax \n\t" + "mov %%rax, %%r9 \n\t" + "add %%r9, %%r9 \n\t" + "mov %%rax, %%r10 \n\t" + "add %%r9, %%r10 \n\t" + "add %0, %%rax \n\t" + "add %0, %%r9 \n\t" + "add %0, %%r10 \n\t" + "vxorps %%zmm0, %%zmm0, %%zmm0 \n\t" + "vxorps %%zmm1, %%zmm1, %%zmm1 \n\t" + "vxorps %%zmm2, %%zmm2, %%zmm2 \n\t" + "vxorps %%zmm3, %%zmm3, %%zmm3 \n\t" + ".align 16 \n\t" + "0: \n\t" + "mov %2, %%ecx \n\t" + ".align 16 \n\t" + "1: \n\t" + "vmovups (%0), %%zmm4 \n\t" + "vmovups (%%rax), %%zmm5 \n\t" + "vmovups (%%r9), %%zmm6 \n\t" + "vmovups (%%r10), %%zmm7 \n\t" + "vaddps %%zmm0, %%zmm4, %%zmm0 \n\t" + "vaddps %%zmm1, %%zmm5, %%zmm1 \n\t" + "vaddps %%zmm2, %%zmm6, %%zmm2 \n\t" + "vaddps %%zmm3, %%zmm7, %%zmm3 \n\t" + "add $0x40, %0 \n\t" + "add $0x40, %%rax \n\t" + "add $0x40, %%r9 \n\t" + "add $0x40, %%r10 \n\t" + "dec %%ecx \n\t" + "jg 1b \n\t" + "add %%rdi, %0 \n\t" + "add %%rdi, %%rax \n\t" + "add %%rdi, %%r9 \n\t" + "add %%rdi, %%r10 \n\t" + "dec %%ebx \n\t" + "jg 0b \n\t" + "vbroadcastss (%6), %%zmm4 \n\t" + "vdivps %%zmm4, %%zmm0, %%zmm0 \n\t" + "vdivps %%zmm4, %%zmm1, %%zmm1 \n\t" + "vdivps %%zmm4, %%zmm2, %%zmm2 \n\t" + "vdivps %%zmm4, %%zmm3, %%zmm3 \n\t" + "vmovups %%zmm0, (%1) \n\t" + "vmovups %%zmm1, 0x40(%1) \n\t" + "vmovups %%zmm2, 0x80(%1) \n\t" + "vmovups %%zmm3, 0xC0(%1) \n\t" + : + : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride), "r"(&poolSize) + : "%eax", "%rax", "%ecx", "%r10", "%r9", "%rdi", "%zmm0", "%zmm1", "%zmm2", "%zmm3", + "%zmm4", "%zmm5", "%zmm6", "%zmm7", "memory", "cc"); +} + +inline void pooling_c16_mean_w2( + const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride, F32 poolSize) +{ + __asm__ __volatile__( + "mov %%eax, %%eax \n\t" + "mov %4, %%eax \n\t" + "mov %%rax, %%rdi \n\t" + "mov %5, %%eax \n\t" + "add %0, %%rax \n\t" + "vxorps %%zmm0, %%zmm0, %%zmm0 \n\t" + "vxorps %%zmm1, %%zmm1, %%zmm1 \n\t" + ".align 16 \n\t" + "0: \n\t" + "mov %2, %%ecx \n\t" + ".align 16 \n\t" + "1: \n\t" + "vmovups (%0), %%zmm4 \n\t" + "vmovups (%%rax), %%zmm5 \n\t" + "vaddps %%zmm0, %%zmm4, %%zmm0 \n\t" + "vaddps %%zmm1, %%zmm5, %%zmm1 \n\t" + "add $0x40, %0 \n\t" + "add $0x40, %%rax \n\t" + "dec %%ecx \n\t" + "jg 1b \n\t" + "add %%rdi, %0 \n\t" + "add %%rdi, %%rax \n\t" + "dec %%ebx \n\t" + "jg 0b \n\t" + "vbroadcastss (%6), %%zmm4 \n\t" + "vdivps %%zmm4, %%zmm0, %%zmm0 \n\t" + "vdivps %%zmm4, %%zmm1, %%zmm1 \n\t" + "vmovups %%zmm0, (%1) \n\t" + 
"vmovups %%zmm1, 0x40(%1) \n\t" + : + : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride), "r"(&poolSize) + : "%eax", "%rax", "%ecx", "%rdi", "%zmm0", "%zmm1", "%zmm4", "%zmm5", "memory", "cc"); +} + +inline void pooling_c16_mean_w1( + const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride, F32 poolSize) +{ + __asm__ __volatile__( + "mov %%eax, %%eax \n\t" + "mov %4, %%eax \n\t" + "mov %%rax, %%rdi \n\t" + "vxorps %%zmm0, %%zmm0, %%zmm0 \n\t" + ".align 16 \n\t" + "0: \n\t" + "mov %2, %%ecx \n\t" + ".align 16 \n\t" + "1: \n\t" + "vmovups (%0), %%zmm4 \n\t" + "vaddps %%zmm0, %%zmm4, %%zmm0 \n\t" + "add $0x40, %0 \n\t" + "dec %%ecx \n\t" + "jg 1b \n\t" + "add %%rdi, %0 \n\t" + "dec %%ebx \n\t" + "jg 0b \n\t" + "vbroadcastss (%6), %%zmm4 \n\t" + "vdivps %%zmm4, %%zmm0, %%zmm0 \n\t" + "vmovups %%zmm0, (%1) \n\t" + : + : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride), "r"(&poolSize) + : "%eax", "%rax", "%ecx", "%rdi", "%zmm0", "%zmm4", "memory", "cc"); +} + +#endif \ No newline at end of file diff --git a/compute/tensor/src/cpu/x86/fp32/pooling_nchw.cpp b/compute/tensor/src/cpu/x86/fp32/pooling_nchw.cpp new file mode 100644 index 00000000..8776f591 --- /dev/null +++ b/compute/tensor/src/cpu/x86/fp32/pooling_nchw.cpp @@ -0,0 +1,337 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#include "cpu/x86/fp32/tensor_computing_fp32.h" + +#define UNROLL_W 32 + +typedef void (*pooling_max_func)(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride); +typedef void (*pooling_mean_func)( + const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride, F32 poolSize); + +void pooling_max_w32(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride) +{ + __m256 x1, x2, x3, x4; + if (stride == 1) { + x1 = _mm256_loadu_ps(curI); + x2 = _mm256_loadu_ps(curI + 8); + x3 = _mm256_loadu_ps(curI + 16); + x4 = _mm256_loadu_ps(curI + 24); + for (U32 h = 0; h < kh; ++h) { + for (U32 w = 0; w < kw; ++w) { + x1 = _mm256_max_ps(x1, _mm256_loadu_ps(curI)); + x2 = _mm256_max_ps(x2, _mm256_loadu_ps(curI + 8)); + x3 = _mm256_max_ps(x3, _mm256_loadu_ps(curI + 16)); + x4 = _mm256_max_ps(x4, _mm256_loadu_ps(curI + 24)); + curI += 1; + } + curI += iStep; + } + } else { + __m256i v256index = _mm256_set_epi32( + stride * 7, stride * 6, stride * 5, stride * 4, stride * 3, stride * 2, stride, 0); + x1 = _mm256_i32gather_ps(curI, v256index, 4); + x2 = _mm256_i32gather_ps(curI + 8 * stride, v256index, 4); + x3 = _mm256_i32gather_ps(curI + 16 * stride, v256index, 4); + x4 = _mm256_i32gather_ps(curI + 24 * stride, v256index, 4); + for (U32 h = 0; h < kh; ++h) { + for (U32 w = 0; w < kw; ++w) { + x1 = _mm256_max_ps(x1, _mm256_i32gather_ps(curI, v256index, 4)); + x2 = _mm256_max_ps(x2, _mm256_i32gather_ps(curI + 8 * stride, v256index, 4)); + x3 = _mm256_max_ps(x3, _mm256_i32gather_ps(curI + 16 * stride, v256index, 4)); + x4 = _mm256_max_ps(x4, _mm256_i32gather_ps(curI + 24 * stride, v256index, 4)); + curI += 1; + } + curI += iStep; + } + } + _mm256_storeu_ps(curO, x1); + _mm256_storeu_ps(curO + 8, x2); + _mm256_storeu_ps(curO + 16, x3); + _mm256_storeu_ps(curO + 24, x4); +} + +void pooling_max_w16(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride) +{ + __m256 x1, x2; + if (stride == 1) { + x1 = _mm256_loadu_ps(curI); + x2 = _mm256_loadu_ps(curI + 8); + for (U32 h = 0; h < kh; ++h) { + for (U32 w = 0; w < kw; ++w) { + x1 = _mm256_max_ps(x1, _mm256_loadu_ps(curI)); + x2 = _mm256_max_ps(x2, _mm256_loadu_ps(curI + 8)); + curI += 1; + } + curI += iStep; + } + } else { + __m256i v256index = _mm256_set_epi32( + stride * 7, stride * 6, stride * 5, stride * 4, stride * 3, stride * 2, stride, 0); + x1 = _mm256_i32gather_ps(curI, v256index, 4); + x2 = _mm256_i32gather_ps(curI + 8 * stride, v256index, 4); + for (U32 h = 0; h < kh; ++h) { + for (U32 w = 0; w < kw; ++w) { + x1 = _mm256_max_ps(x1, _mm256_i32gather_ps(curI, v256index, 4)); + x2 = _mm256_max_ps(x2, _mm256_i32gather_ps(curI + 8 * stride, v256index, 4)); + curI += 1; + } + curI += iStep; + } + } + _mm256_storeu_ps(curO, x1); + _mm256_storeu_ps(curO + 8, x2); +} + +void pooling_max_w8(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride) +{ + __m256 x1; + if (stride == 1) { + x1 = _mm256_loadu_ps(curI); + for (U32 h = 0; h < kh; ++h) { + for (U32 w = 0; w < kw; ++w) { + x1 = _mm256_max_ps(x1, _mm256_loadu_ps(curI)); + curI += 1; + } + curI += iStep; + } + } else { + __m256i v256index = _mm256_set_epi32( + stride * 7, stride * 6, stride * 5, stride * 4, stride * 3, stride * 2, stride, 0); + x1 = _mm256_i32gather_ps(curI, v256index, 4); + for (U32 h = 0; h < kh; ++h) { + for (U32 w = 0; w < kw; ++w) { + x1 = _mm256_max_ps(x1, _mm256_i32gather_ps(curI, v256index, 4)); + curI += 1; + } + curI += iStep; + } + } + _mm256_storeu_ps(curO, x1); +} + +void pooling_max_w0(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 
iStep, U32 stride) +{ + *curO = *curI; + for (U32 h = 0; h < kh; ++h) { + for (U32 w = 0; w < kw; ++w) { + *curO = UNI_MAX(*curO, *curI); + curI += 1; + } + curI += iStep; + } +} + +void pooling_mean_w32( + const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride, F32 poolSize) +{ + __m256 x0 = _mm256_set1_ps(1.0f / poolSize); + __m256 x1 = _mm256_setzero_ps(); + __m256 x2 = _mm256_setzero_ps(); + __m256 x3 = _mm256_setzero_ps(); + __m256 x4 = _mm256_setzero_ps(); + + if (stride == 1) { + for (U32 h = 0; h < kh; ++h) { + for (U32 w = 0; w < kw; ++w) { + x1 = _mm256_add_ps(x1, _mm256_loadu_ps(curI)); + x2 = _mm256_add_ps(x2, _mm256_loadu_ps(curI + 8)); + x3 = _mm256_add_ps(x3, _mm256_loadu_ps(curI + 16)); + x4 = _mm256_add_ps(x4, _mm256_loadu_ps(curI + 24)); + curI += 1; + } + curI += iStep; + } + } else { + __m256i v256index = _mm256_set_epi32( + stride * 7, stride * 6, stride * 5, stride * 4, stride * 3, stride * 2, stride, 0); + for (U32 h = 0; h < kh; ++h) { + for (U32 w = 0; w < kw; ++w) { + x1 = _mm256_add_ps(x1, _mm256_i32gather_ps(curI, v256index, 4)); + x2 = _mm256_add_ps(x2, _mm256_i32gather_ps(curI + 8 * stride, v256index, 4)); + x3 = _mm256_add_ps(x3, _mm256_i32gather_ps(curI + 16 * stride, v256index, 4)); + x4 = _mm256_add_ps(x4, _mm256_i32gather_ps(curI + 24 * stride, v256index, 4)); + curI += 1; + } + curI += iStep; + } + } + _mm256_storeu_ps(curO, _mm256_mul_ps(x1, x0)); + _mm256_storeu_ps(curO + 8, _mm256_mul_ps(x2, x0)); + _mm256_storeu_ps(curO + 16, _mm256_mul_ps(x3, x0)); + _mm256_storeu_ps(curO + 24, _mm256_mul_ps(x4, x0)); +} + +void pooling_mean_w16( + const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride, F32 poolSize) +{ + __m256 x0 = _mm256_set1_ps(1.0f / poolSize); + __m256 x1 = _mm256_setzero_ps(); + __m256 x2 = _mm256_setzero_ps(); + + if (stride == 1) { + for (U32 h = 0; h < kh; ++h) { + for (U32 w = 0; w < kw; ++w) { + x1 = _mm256_add_ps(x1, _mm256_loadu_ps(curI)); + x2 = _mm256_add_ps(x2, _mm256_loadu_ps(curI + 8)); + curI += 1; + } + curI += iStep; + } + } else { + __m256i v256index = _mm256_set_epi32( + stride * 7, stride * 6, stride * 5, stride * 4, stride * 3, stride * 2, stride, 0); + for (U32 h = 0; h < kh; ++h) { + for (U32 w = 0; w < kw; ++w) { + x1 = _mm256_add_ps(x1, _mm256_i32gather_ps(curI, v256index, 4)); + x2 = _mm256_add_ps(x2, _mm256_i32gather_ps(curI + 8 * stride, v256index, 4)); + curI += 1; + } + curI += iStep; + } + } + _mm256_storeu_ps(curO, _mm256_mul_ps(x1, x0)); + _mm256_storeu_ps(curO + 8, _mm256_mul_ps(x2, x0)); +} + +void pooling_mean_w8(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride, F32 poolSize) +{ + __m256 x0 = _mm256_set1_ps(1.0f / poolSize); + __m256 x1 = _mm256_setzero_ps(); + + if (stride == 1) { + for (U32 h = 0; h < kh; ++h) { + for (U32 w = 0; w < kw; ++w) { + x1 = _mm256_add_ps(x1, _mm256_loadu_ps(curI)); + curI += 1; + } + curI += iStep; + } + } else { + __m256i v256index = _mm256_set_epi32( + stride * 7, stride * 6, stride * 5, stride * 4, stride * 3, stride * 2, stride, 0); + for (U32 h = 0; h < kh; ++h) { + for (U32 w = 0; w < kw; ++w) { + x1 = _mm256_add_ps(x1, _mm256_i32gather_ps(curI, v256index, 4)); + curI += 1; + } + curI += iStep; + } + } + _mm256_storeu_ps(curO, _mm256_mul_ps(x1, x0)); +} + +void pooling_mean_w0(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride, F32 poolSize) +{ + *curO = 0; + for (U32 h = 0; h < kh; ++h) { + for (U32 w = 0; w < kw; ++w) { + *curO += *curI; + curI += 1; + } + curI += iStep; + } + *curO /= poolSize; +} + +EE 
pooling_nchw_fp32( + TensorDesc inputDesc, const F32 *input, PoolingParamSpec p, TensorDesc outputDesc, F32 *output) +{ + if (nullptr == input || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt, odt; + DataFormat idf, odf; + U32 in = 0, ic = 0, ih = 0, iw = 0, on = 0, oc = 0, oh = 0, ow = 0; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + + if (idt != odt || idt != DT_F32) { + CHECK_STATUS(NOT_MATCH); + } + if (in != on || ic != oc) { + CHECK_STATUS(NOT_MATCH); + } + if (idf != DF_NCHW || odf != idf) { + CHECK_STATUS(NOT_MATCH); + } + + PoolingMode pm = p.mode; + U32 strideH = p.stride_h; + U32 strideW = p.stride_w; + U32 paddingT = p.pad_top; + U32 paddingL = p.pad_left; + U32 kernelSizeH = p.kernel_h; + U32 kernelSizeW = p.kernel_w; + U32 wSize, kh, kw, iStep; + F32 *curO; + const F32 *curI; + if (paddingT >= kernelSizeH || paddingL >= kernelSizeW) { + CHECK_STATUS(NOT_SUPPORTED); + } + + U32 owInter = (iw + paddingL - kernelSizeW) / strideW + 1; + U32 wSizes[5] = {1, 8, 16, 16, 32}; + pooling_max_func pooling_max[5] = { + pooling_max_w0, pooling_max_w8, pooling_max_w16, pooling_max_w16, pooling_max_w32}; + pooling_mean_func pooling_mean[5] = { + pooling_mean_w0, pooling_mean_w8, pooling_mean_w16, pooling_mean_w16, pooling_mean_w32}; + F32 poolSize = kernelSizeH * kernelSizeW; + for (U32 n = 0; n < in; n++) { + for (U32 c = 0; c < ic; c++) { + for (U32 h = 0; h < oh; h++) { + int hstart = (int)h * (int)strideH - (int)paddingT; + int hend = UNI_MIN(hstart + kernelSizeH, ih); + hstart = UNI_MAX(hstart, 0); + kh = hend - hstart; + for (U32 w = 0; w < ow; w += wSize) { + if (w < owInter) { + wSize = UNI_MIN(owInter - w, UNROLL_W); + } else { + wSize = 1; + } + wSize = wSizes[wSize >> 3]; + int wstart = (int)w * (int)strideW - (int)paddingL; + int wend = UNI_MIN(wstart + kernelSizeW, iw); + wstart = UNI_MAX(wstart, 0); + + curI = input + (hstart * iw + wstart); + curO = output + (h * ow + w); + kw = wend - wstart; + iStep = iw - kw; + if (!p.count_include_pad) { + poolSize = kh * kw; + } + if (kw < kernelSizeW) { + wSize = 1; + } + switch (pm) { + case POOLING_MAX: { + pooling_max[wSize >> 3](curI, curO, kw, kh, iStep, strideW); + break; + } + case POOLING_MEAN: { + pooling_mean[wSize >> 3](curI, curO, kw, kh, iStep, strideW, poolSize); + break; + } + default: + CHECK_STATUS(NOT_SUPPORTED); + } + } + } + input += ih * iw; + output += oh * ow; + } + } + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/x86/fp32/scale.cpp b/compute/tensor/src/cpu/x86/fp32/scale.cpp index f2887fb9..c1d5b830 100644 --- a/compute/tensor/src/cpu/x86/fp32/scale.cpp +++ b/compute/tensor/src/cpu/x86/fp32/scale.cpp @@ -59,29 +59,45 @@ EE scale_nchwc8_fp32( return SUCCESS; } +template EE scale_nchw_fp32( F32 *input, F32 *alpha, F32 *beta, I32 in, I32 ic, I32 elements_per_channel, F32 *output) { __m256 one = _mm256_set1_ps(1.); __m256 zero = _mm256_set1_ps(0.); - U32 index = 0; - for (I32 n = 0; n < in; n++) { - for (I32 c = 0; c < ic; c++) { - __m256 alpha_vec = (alpha == nullptr) ? one : _mm256_set1_ps(alpha[c]); - __m256 beta_vec = (beta == nullptr) ? 
zero : _mm256_set1_ps(beta[c]); - I32 i = 0; - for (; i < elements_per_channel - 7; i += 8) { - __m256 in_vec = _mm256_loadu_ps(input + index); - __m256 out_vec = _mm256_fmadd_ps(alpha_vec, in_vec, beta_vec); - _mm256_storeu_ps(output + index, out_vec); - index += 8; +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (int j = 0; j < in * ic; j++) { + int n = j / ic; + int c = j % ic; + //for (I32 n = 0; n < in; n++) { + // for (I32 c = 0; c < ic; c++) { + U32 dst = j * elements_per_channel, src = 0; + __m256 alpha_vec = (alpha == nullptr) ? one : _mm256_set1_ps(alpha[c]); + __m256 beta_vec = (beta == nullptr) ? zero : _mm256_set1_ps(beta[c]); + I32 i = 0; + for (; i < elements_per_channel - 7; i += 8) { + if (icoc_equal) { + src = (n * ic + c) * elements_per_channel + i; + } else { + src = n * elements_per_channel + i; } - for (; i < elements_per_channel; i++) { - float alpha_s = (alpha == nullptr) ? 1 : alpha[c]; - float beta_s = (beta == nullptr) ? 0 : beta[c]; - output[index] = alpha_s * input[index] + beta_s; - index++; + __m256 in_vec = _mm256_loadu_ps(input + src); + __m256 out_vec = _mm256_fmadd_ps(alpha_vec, in_vec, beta_vec); + _mm256_storeu_ps(output + dst, out_vec); + dst += 8; + } + for (; i < elements_per_channel; i++) { + if (icoc_equal) { + src = (n * ic + c) * elements_per_channel + i; + } else { + src = n * elements_per_channel + i; } + float alpha_s = (alpha == nullptr) ? 1 : alpha[c]; + float beta_s = (beta == nullptr) ? 0 : beta[c]; + output[dst] = alpha_s * input[src] + beta_s; + dst++; } } return SUCCESS; @@ -114,6 +130,7 @@ EE scale_nhwc_fp32( for (; c < ic; c++) { float alpha_s = (alpha == nullptr) ? 1 : alpha[c]; float beta_s = (beta == nullptr) ? 0 : beta[c]; + float in_s; if (icoc_equal) { in_s = input[dst]; } else { @@ -143,14 +160,18 @@ EE scale_fp32(F32 *input, } EE ret = SUCCESS; // If oc is 1, it means that weights/vectors have only one param, so we need use the calculation logic of nchw. 
- if (axis == 1 || axis == 0 || oc == 1) { - ret = scale_nchw_fp32(input, alpha, beta, on, oc, elements_per_channel, output); - } else if (axis == nDims - 1) { + if (axis == nDims - 1) { if (ic == oc) { ret = scale_nhwc_fp32<true>(input, alpha, beta, on, oc, elements_per_channel, output); } else { ret = scale_nhwc_fp32<false>(input, alpha, beta, on, oc, elements_per_channel, output); } + } else if (axis == 1 || axis == 0 || oc == 1) { + if (ic == oc) { + ret = scale_nchw_fp32<true>(input, alpha, beta, on, oc, elements_per_channel, output); + } else { + ret = scale_nchw_fp32<false>(input, alpha, beta, on, oc, elements_per_channel, output); + } } else if (axis == nDims) { ret = scale_nchwc8_fp32(input, alpha, beta, on, oc, elements_per_channel, output); #ifdef _USE_INT8 diff --git a/compute/tensor/src/cpu/x86/fp32/softmax.cpp b/compute/tensor/src/cpu/x86/fp32/softmax.cpp index 3edd624a..71fffc87 100644 --- a/compute/tensor/src/cpu/x86/fp32/softmax.cpp +++ b/compute/tensor/src/cpu/x86/fp32/softmax.cpp @@ -14,59 +14,76 @@ #include "cpu/x86/fp32/tensor_computing_fp32.h" #include "tensor_transpose.h" -void softmax_lastAxis_fp32(const F32 *input, I32 loopOuter, I32 loops, F32 *output) +template <bool logsoftmax> +static void softmax_lastAxis_fp32(const F32 *input, I32 loopOuter, I32 loops, F32 *output) { for (I32 i = 0; i < loopOuter; i++) { const F32 *inputPtr = input + i * loops; F32 *outputPtr = output + i * loops; - __m256 max_v, sub_v, sum_v, tmp_v; + __m256 max_v, tmp_v; F32 max_s, tmp_s; - array_minmax_value_f32(inputPtr, loops, 2, &max_s); - max_v = _mm256_set1_ps(max_s); - sum_v = _mm256_set1_ps(0.f); - + if (!logsoftmax) { + array_minmax_value_f32(inputPtr, loops, 2, &max_s); + max_v = _mm256_set1_ps(max_s); + } I32 j = 0; - F32 sum_s = 0; - for (j = 0; j < loops - 7; j += 8) { + __m256 sum_v = _mm256_set1_ps(0.f); + for (; j < loops - 7; j += 8) { __m256 in = _mm256_loadu_ps(inputPtr + j); - sub_v = _mm256_sub_ps(in, max_v); - tmp_v = _mm256_exp_ps(sub_v); + if (!logsoftmax) { + in = _mm256_sub_ps(in, max_v); + } + tmp_v = _mm256_exp_ps(in); sum_v = _mm256_add_ps(sum_v, tmp_v); - _mm256_storeu_ps(outputPtr + j, tmp_v); + if (!logsoftmax) { + _mm256_storeu_ps(outputPtr + j, tmp_v); + } } - sum_s += _mm256_sum_ps(sum_v); + F32 sum_s = _mm256_sum_ps(sum_v); for (; j < loops; j++) { - tmp_s = exp(inputPtr[j] - max_s); - outputPtr[j] = tmp_s; + if (logsoftmax) { + tmp_s = exp(inputPtr[j]); + } else { + tmp_s = exp(inputPtr[j] - max_s); + outputPtr[j] = tmp_s; + } sum_s += tmp_s; } - array_scale_f32(outputPtr, outputPtr, loops, 1.0 / sum_s, 0); + if (logsoftmax) { + array_scale_f32(inputPtr, outputPtr, loops, 1.0, -log(sum_s)); + } else { + array_scale_f32(outputPtr, outputPtr, loops, 1.0 / sum_s, 0); + } } } +template <bool logsoftmax> void softmax_anyAxis_fp32(const F32 *input, I32 loopOuter, I32 loops, I32 loopInner, F32 *output) { std::vector<F32> buffer(loopInner * 2); F32 *maxBuffer = &buffer[0]; F32 *sumBuffer = &buffer[loopInner]; I32 k = 0; + F32 tmp_s; for (I32 i = 0; i < loopOuter; i++) { const F32 *inputPtrBase = input + i * loops * loopInner; F32 *outputPtrBase = output + i * loops * loopInner; - memcpy(maxBuffer, inputPtrBase, loopInner * sizeof(F32)); - memset(sumBuffer, 0, loopInner * sizeof(F32)); - for (I32 j = 1; j < loops; j++) { - const F32 *inputPtr = inputPtrBase + j * loopInner; - for (k = 0; k < loopInner - 7; k += 8) { - __m256 in_v = _mm256_loadu_ps(inputPtr + k); - __m256 out_v = _mm256_loadu_ps(maxBuffer + k); - __m256 max_v = _mm256_max_ps(in_v, out_v); - _mm256_storeu_ps(maxBuffer + k, max_v); - } - for (; k < loopInner; k++) 
{ - maxBuffer[k] = UNI_MAX(maxBuffer[k], inputPtr[k]); + UNI_MEMSET(sumBuffer, 0, loopInner * sizeof(F32)); + if (!logsoftmax) { + UNI_MEMCPY(maxBuffer, inputPtrBase, loopInner * sizeof(F32)); + for (I32 j = 1; j < loops; j++) { + const F32 *inputPtr = inputPtrBase + j * loopInner; + for (k = 0; k < loopInner - 7; k += 8) { + __m256 in_v = _mm256_loadu_ps(inputPtr + k); + __m256 out_v = _mm256_loadu_ps(maxBuffer + k); + __m256 max_v = _mm256_max_ps(in_v, out_v); + _mm256_storeu_ps(maxBuffer + k, max_v); + } + for (; k < loopInner; k++) { + maxBuffer[k] = UNI_MAX(maxBuffer[k], inputPtr[k]); + } } } for (I32 j = 0; j < loops; j++) { @@ -74,35 +91,69 @@ void softmax_anyAxis_fp32(const F32 *input, I32 loopOuter, I32 loops, I32 loopIn F32 *outputPtr = outputPtrBase + j * loopInner; for (k = 0; k < loopInner - 7; k += 8) { __m256 in_v = _mm256_loadu_ps(inputPtr + k); - __m256 max_v = _mm256_loadu_ps(maxBuffer + k); - __m256 sub_v = _mm256_sub_ps(in_v, max_v); - __m256 exp_v = _mm256_exp_ps(sub_v); + if (!logsoftmax) { + in_v = _mm256_sub_ps(in_v, _mm256_loadu_ps(maxBuffer + k)); + } + __m256 exp_v = _mm256_exp_ps(in_v); __m256 sum_v = _mm256_loadu_ps(sumBuffer + k); sum_v = _mm256_add_ps(sum_v, exp_v); _mm256_storeu_ps(sumBuffer + k, sum_v); - _mm256_storeu_ps(outputPtr + k, exp_v); + if (!logsoftmax) { + _mm256_storeu_ps(outputPtr + k, exp_v); + } } for (; k < loopInner; k++) { - outputPtr[k] = exp(inputPtr[k] - maxBuffer[k]); - sumBuffer[k] += outputPtr[k]; + if (logsoftmax) { + tmp_s = exp(inputPtr[k]); + } else { + tmp_s = exp(inputPtr[k] - maxBuffer[k]); + outputPtr[k] = tmp_s; + } + sumBuffer[k] += tmp_s; } } - for (I32 j = 0; j < loops; j++) { - F32 *outputPtr = outputPtrBase + j * loopInner; + if (logsoftmax) { for (k = 0; k < loopInner - 7; k += 8) { - __m256 out_v = _mm256_loadu_ps(outputPtr + k); __m256 sum_v = _mm256_loadu_ps(sumBuffer + k); - out_v = _mm256_div_ps(out_v, sum_v); - _mm256_storeu_ps(outputPtr + k, out_v); + sum_v = _mm256_log_ps(sum_v); + _mm256_storeu_ps(sumBuffer + k, sum_v); } for (; k < loopInner; k++) { - outputPtr[k] /= sumBuffer[k]; + sumBuffer[k] = log(sumBuffer[k]); + } + for (I32 j = 0; j < loops; j++) { + const F32 *inputPtr = inputPtrBase + j * loopInner; + F32 *outputPtr = outputPtrBase + j * loopInner; + for (k = 0; k < loopInner - 7; k += 8) { + __m256 out_v = _mm256_loadu_ps(inputPtr + k); + __m256 sum_v = _mm256_loadu_ps(sumBuffer + k); + out_v = _mm256_sub_ps(out_v, sum_v); + _mm256_storeu_ps(outputPtr + k, out_v); + } + for (; k < loopInner; k++) { + outputPtr[k] -= sumBuffer[k]; + } + } + } else { + for (I32 j = 0; j < loops; j++) { + F32 *outputPtr = outputPtrBase + j * loopInner; + for (k = 0; k < loopInner - 7; k += 8) { + __m256 out_v = _mm256_loadu_ps(outputPtr + k); + __m256 sum_v = _mm256_loadu_ps(sumBuffer + k); + out_v = _mm256_div_ps(out_v, sum_v); + _mm256_storeu_ps(outputPtr + k, out_v); + } + for (; k < loopInner; k++) { + outputPtr[k] /= sumBuffer[k]; + } } } } } -EE softmax_fp32(TensorDesc inputDesc, const F32 *input, int axis, TensorDesc outputDesc, F32 *output) +template +static EE softmax_kernel( + TensorDesc inputDesc, const F32 *input, int axis, TensorDesc outputDesc, F32 *output) { UNUSED(outputDesc); if (nullptr == input || nullptr == output) { @@ -146,9 +197,20 @@ EE softmax_fp32(TensorDesc inputDesc, const F32 *input, int axis, TensorDesc out } U32 loop_outer = size / loops / loop_inner; if (axis == 0) { - softmax_lastAxis_fp32(input, loop_outer, loops, output); + softmax_lastAxis_fp32(input, loop_outer, loops, output); } 
else { - softmax_anyAxis_fp32(input, loop_outer, loops, loop_inner, output); + softmax_anyAxis_fp32(input, loop_outer, loops, loop_inner, output); } return SUCCESS; } + +EE softmax_fp32(TensorDesc inputDesc, const F32 *input, int axis, TensorDesc outputDesc, F32 *output) +{ + return softmax_kernel(inputDesc, input, axis, outputDesc, output); +} + +EE logsoftmax_fp32( + TensorDesc inputDesc, const F32 *input, int axis, TensorDesc outputDesc, F32 *output) +{ + return softmax_kernel(inputDesc, input, axis, outputDesc, output); +} diff --git a/compute/tensor/src/cpu/x86/fp32/tensor_computing_fp32.h b/compute/tensor/src/cpu/x86/fp32/tensor_computing_fp32.h index 52018766..4101368f 100644 --- a/compute/tensor/src/cpu/x86/fp32/tensor_computing_fp32.h +++ b/compute/tensor/src/cpu/x86/fp32/tensor_computing_fp32.h @@ -78,6 +78,20 @@ EE convolution_direct(TensorDesc inputDesc, F32 *outArray, ActivationParamSpec activationDesc); +EE convolution_winograd(TensorDesc inputDesc, + F32 *inArray, + F32 *eltwiseInput, + TensorDesc filterDesc, + const F32 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const F32 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F32 *outArray, + ActivationParamSpec activationDesc); + EE convolution_1x1_direct(TensorDesc inputDesc, F32 *inArray, F32 *eltwiseInput, @@ -110,7 +124,7 @@ EE check_fp32(TensorDesc inputDescA, const F32 *inputB, CheckMode checkMode, TensorDesc outputDesc, - I32 *output); + U8 *output); EE clip_fp32(F32 *input, F32 *output, I32 len, F32 minValue, F32 maxValue); @@ -223,8 +237,13 @@ EE eltwise_u8(std::vector input, void *output, EltwiseMode eltwiseMode); -EE layer_normalization_fp32( - TensorDesc inputDesc, F32 *input, F32 *alpha, F32 *beta, TensorDesc outputDesc, F32 *output); +EE layer_normalization_fp32(TensorDesc inputDesc, + F32 *input, + LayerNormParamSpec p, + F32 *alpha, + F32 *beta, + TensorDesc outputDesc, + F32 *output); EE l2normalization_fp32(TensorDesc inputDesc, const F32 *input, TensorDesc outputDesc, F32 *output); @@ -276,6 +295,12 @@ EE grucell_fp32(TensorDesc xDesc, void *output, Arch arch); +EE pooling_nchw_fp32(TensorDesc inputDesc, + const F32 *input, + PoolingParamSpec poolingParamSpec, + TensorDesc outputDesc, + F32 *output); + EE pooling_fp32(TensorDesc inputDesc, const F32 *input, PoolingParamSpec poolingParamSpec, @@ -305,6 +330,9 @@ EE scale_fp32(F32 *input, EE softmax_fp32( TensorDesc inputDesc, const F32 *input, int axis, TensorDesc outputDesc, F32 *output); +EE logsoftmax_fp32( + TensorDesc inputDesc, const F32 *input, int axis, TensorDesc outputDesc, F32 *output); + EE deconvolution_transform_filter_fp32(TensorDesc filterDesc, const F32 *filter, ConvolutionForwardAlgorithm algorithm, diff --git a/compute/tensor/src/cpu/x86/fp32/transform_functions_fp32.h b/compute/tensor/src/cpu/x86/fp32/transform_functions_fp32.h index 213131d6..1a2db428 100644 --- a/compute/tensor/src/cpu/x86/fp32/transform_functions_fp32.h +++ b/compute/tensor/src/cpu/x86/fp32/transform_functions_fp32.h @@ -45,7 +45,7 @@ inline void transformNCHWCxNx(U32 fc, U32 fh, U32 fw, U32 oc, const F32 *input, _mm256_storeu_ps(dest + 24, _mm256_i32gather_ps(src + 24 * lstep, vindex, 4)); } } - memset(dest + N, 0, ((cSizePadding - cSize) * N * 4)); + UNI_MEMSET(dest + N, 0, ((cSizePadding - cSize) * N * 4)); } } } @@ -112,7 +112,7 @@ inline EE transformNCHWToNCHWCxNx( dest = output + c * fh * fw * 8 + hw * cSizePadding * 8 + c8 * 8; _mm256_storeu_ps(dest, _mm256_mask_i32gather_ps(src256, src, vindex, mask, 4)); } - 
memset(dest + 8, 0, ((cSizePadding - cSize) * 32)); + UNI_MEMSET(dest + 8, 0, ((cSizePadding - cSize) * 32)); } } fn += remain; @@ -128,10 +128,10 @@ inline void PaddingNCHWC8( DataFormat idf; U32 in, ic, ih, iw; CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; U32 padih = paddingT + paddingB + ih; U32 padiw = paddingL + paddingR + iw; @@ -148,8 +148,9 @@ inline void PaddingNCHWC8( #endif for (U32 c = 0; c < ic; ++c) { U32 coff = c * padih * padiw * 8; - memset(tmp + coff, 0, padiw * paddingT * 8 * bytesOf(idt)); - memset(tmp + coff + (ih + paddingT) * padiw * 8, 0, padiw * paddingB * 8 * bytesOf(idt)); + UNI_MEMSET(tmp + coff, 0, padiw * paddingT * 8 * bytesOf(idt)); + UNI_MEMSET( + tmp + coff + (ih + paddingT) * padiw * 8, 0, padiw * paddingB * 8 * bytesOf(idt)); } #ifdef _USE_OPENMP @@ -161,10 +162,10 @@ inline void PaddingNCHWC8( U32 h = hc % ih; U32 hoff = (h + paddingT) * padiw; - memset(tmp + coff + hoff * 8, 0, paddingL * 8 * bytesOf(idt)); - memcpy(tmp + coff + (hoff + paddingL) * 8, data + c * ih * iw * 8 + h * iw * 8, + UNI_MEMSET(tmp + coff + hoff * 8, 0, paddingL * 8 * bytesOf(idt)); + UNI_MEMCPY(tmp + coff + (hoff + paddingL) * 8, data + c * ih * iw * 8 + h * iw * 8, iw * 8 * bytesOf(idt)); - memset(tmp + coff + (hoff + (paddingL + iw)) * 8, 0, paddingR * 8 * bytesOf(idt)); + UNI_MEMSET(tmp + coff + (hoff + (paddingL + iw)) * 8, 0, paddingR * 8 * bytesOf(idt)); } #ifdef _USE_OPENMP @@ -188,8 +189,8 @@ inline void deconvOverlapAndCrop(F32 *input, U32 fhfw = fh * fw; U32 strideH = convParamSpec.stride_h; U32 strideW = convParamSpec.stride_w; - U32 paddingT = convParamSpec.padding_top; - U32 paddingL = convParamSpec.padding_left; + U32 paddingT = convParamSpec.pad_top; + U32 paddingL = convParamSpec.pad_left; __m256i vindex = _mm256_set_epi32(fhfw * 7, fhfw * 6, fhfw * 5, fhfw * 4, fhfw * 3, fhfw * 2, fhfw, 0); for (U32 kn = 0; kn < in; ++kn) { @@ -216,7 +217,7 @@ inline void deconvOverlapAndCrop(F32 *input, } } } - input += ic * ih * iw; + input += oc * fh * fw * ih * iw; output += oc * oh * ow; } } @@ -237,8 +238,8 @@ inline void deconvOverlapAndCropNCHWC8(F32 *input, U32 fhfw = fh * fw; U32 strideH = convParamSpec.stride_h; U32 strideW = convParamSpec.stride_w; - U32 paddingT = convParamSpec.padding_top; - U32 paddingL = convParamSpec.padding_left; + U32 paddingT = convParamSpec.pad_top; + U32 paddingL = convParamSpec.pad_left; for (U32 kn = 0; kn < in; ++kn) { for (U32 kh = 0; kh < ih; ++kh) { for (U32 kw = 0; kw < iw; ++kw) { @@ -263,7 +264,7 @@ inline void deconvOverlapAndCropNCHWC8(F32 *input, } } } - input += ic * ih * iw; + input += oc * fh * fw * ih * iw; output += oc * oh * ow; } } @@ -285,8 +286,8 @@ inline void deconvOverlapAndCropEqualNCHWC8(F32 *input, U32 fhfw = fh * fw; U32 strideH = convParamSpec.stride_h; U32 strideW = convParamSpec.stride_w; - U32 paddingT = convParamSpec.padding_top; - U32 paddingL = convParamSpec.padding_left; + U32 paddingT = convParamSpec.pad_top; + U32 paddingL = convParamSpec.pad_left; for (U32 kn = 0; kn < in; ++kn) { for (U32 kc = 0; kc < oc; kc += 8) { #ifdef _USE_OPENMP @@ -312,7 +313,7 @@ inline void deconvOverlapAndCropEqualNCHWC8(F32 *input, } } } - input += ic * ih * 
iw; + input += oc * fh * fw * ih * iw; output += oc * oh * ow; } } diff --git a/compute/tensor/src/cpu/x86/fp32/x86_functions_fp32.h b/compute/tensor/src/cpu/x86/fp32/x86_functions_fp32.h index 64aaeca9..963218ea 100644 --- a/compute/tensor/src/cpu/x86/fp32/x86_functions_fp32.h +++ b/compute/tensor/src/cpu/x86/fp32/x86_functions_fp32.h @@ -13,147 +13,101 @@ #ifndef CHEETAH_X86_FUNCTIONS_FP32_H #define CHEETAH_X86_FUNCTIONS_FP32_H -#include + +#include "cpu/cpu_functions_template.h" #include "x86_avx2_expand.h" -#include "parameter_spec.h" -#include "uni.h" #include "thread_affinity.h" inline EE activation_fp32(F32 *input, U32 len, ActivationParamSpec activationDesc, F32 *output) { - __m256 in, out; __m256 zero = _mm256_set1_ps(0.); __m256 one = _mm256_set1_ps(1.); __m256 three = _mm256_set1_ps(3.); __m256 six = _mm256_set1_ps(6.); __m256 signm = _mm256_set1_ps(-0.0); - U32 len_main = len / 8; - U32 len_tail = len % 8; - - F32 value; + U32 loops = len / 8 * 8; EE ret = SUCCESS; - switch (activationDesc.mode) { case ACTIVATION_NULL: { + if (output != input) { + UNI_MEMCPY(output, input, sizeof(float) * len); + } + loops = len; break; } case ACTIVATION_RELU: { - U32 main_len = len - len_tail; if (activationDesc.value[0] == 0) { #ifdef _USE_OPENMP #pragma omp parallel for num_threads(OMP_NUM_THREADS) schedule(static) #endif - for (U32 i = 0; i < len_main; i++) { - _mm256_storeu_ps( - output + i * 8, _mm256_max_ps(zero, _mm256_loadu_ps(input + i * 8))); - } - for (U32 i = 0; i < len_tail; i++) { - output[main_len + i] = (input[main_len + i] < 0) ? 0 : input[main_len + i]; + for (U32 i = 0; i < loops; i += 8) { + _mm256_storeu_ps(output + i, _mm256_max_ps(zero, _mm256_loadu_ps(input + i))); } } else { __m256 scale = _mm256_set1_ps(activationDesc.value[0]); #ifdef _USE_OPENMP #pragma omp parallel for num_threads(OMP_NUM_THREADS) schedule(static) #endif - for (U32 i = 0; i < len_main; i++) { - __m256 tmp = _mm256_loadu_ps(input + i * 8); - _mm256_storeu_ps(output + i * 8, _mm256_max_ps(_mm256_mul_ps(scale, tmp), tmp)); - } - for (U32 i = 0; i < len_tail; i++) { - float tmp = activationDesc.value[0] * input[main_len + i]; - output[main_len + i] = (input[main_len + i] < tmp) ? tmp : input[main_len + i]; + for (U32 i = 0; i < loops; i += 8) { + __m256 tmp = _mm256_loadu_ps(input + i); + _mm256_storeu_ps(output + i, _mm256_max_ps(_mm256_mul_ps(scale, tmp), tmp)); } } break; } case ACTIVATION_RELU6: { - for (U32 i = 0; i < len_main; i++) { - in = _mm256_loadu_ps(input); - out = _mm256_max_ps(zero, in); + for (U32 i = 0; i < loops; i += 8) { + __m256 in = _mm256_loadu_ps(input + i); + __m256 out = _mm256_max_ps(zero, in); out = _mm256_min_ps(six, out); - _mm256_storeu_ps(output, out); - input += 8; - output += 8; - } - for (U32 i = 0; i < len_tail; i++) { - value = (input[i] < 0) ? 0 : input[i]; - if (value > 6) { - value = 6; - } - output[i] = value; + _mm256_storeu_ps(output + i, out); } break; } case ACTIVATION_H_SIGMOID: { - for (U32 i = 0; i < len_main; i++) { - in = _mm256_loadu_ps(input); - out = _mm256_add_ps(in, three); + for (U32 i = 0; i < loops; i += 8) { + __m256 in = _mm256_loadu_ps(input + i); + __m256 out = _mm256_add_ps(in, three); out = _mm256_max_ps(out, zero); out = _mm256_min_ps(out, six); out = _mm256_div_ps(out, six); - _mm256_storeu_ps(output, out); - input += 8; - output += 8; - } - for (U32 i = 0; i < len_tail; i++) { - value = input[i] + 3; - value = (value < 0) ? 0 : value; - value = (value > 6) ? 
6 : value; - value = value / 6; - output[i] = value; + _mm256_storeu_ps(output + i, out); } break; } case ACTIVATION_H_SWISH: { - for (U32 i = 0; i < len_main; i++) { - in = _mm256_loadu_ps(input); - out = _mm256_add_ps(in, three); +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) schedule(static) +#endif + for (U32 i = 0; i < loops; i += 8) { + __m256 in = _mm256_loadu_ps(input + i); + __m256 out = _mm256_add_ps(in, three); out = _mm256_max_ps(out, zero); out = _mm256_min_ps(out, six); out = _mm256_div_ps(out, six); out = _mm256_mul_ps(out, in); - _mm256_storeu_ps(output, out); - input += 8; - output += 8; - } - for (U32 i = 0; i < len_tail; i++) { - value = input[i] + 3; - value = (value < 0) ? 0 : value; - value = (value > 6) ? 6 : value; - value = input[i] * value; - value = value / 6; - output[i] = value; + _mm256_storeu_ps(output + i, out); } break; } case ACTIVATION_H_SWISH_NODIV: { - for (U32 i = 0; i < len_main; i++) { - in = _mm256_loadu_ps(input); - out = _mm256_add_ps(in, three); + for (U32 i = 0; i < loops; i += 8) { + __m256 in = _mm256_loadu_ps(input + i); + __m256 out = _mm256_add_ps(in, three); out = _mm256_max_ps(out, zero); out = _mm256_min_ps(out, six); out = _mm256_mul_ps(out, in); - _mm256_storeu_ps(output, out); - input += 8; - output += 8; - } - for (U32 i = 0; i < len_tail; i++) { - value = input[i] + 3; - value = (value < 0) ? 0 : value; - value = (value > 6) ? 6 : value; - value = input[i] * value; - output[i] = value; + _mm256_storeu_ps(output + i, out); } break; } case ACTIVATION_GELU: { - F32 two_div_PI_sqrt = sqrt(2 / 3.14159265358979323846); - __m256 vec0 = _mm256_set1_ps(two_div_PI_sqrt); + __m256 vec0 = _mm256_set1_ps(sqrt(2 / 3.14159265358979323846)); __m256 vec1 = _mm256_set1_ps(0.044715); __m256 vec2 = _mm256_set1_ps(0.5); - for (U32 i = 0; i < len_main; i++) { - in = _mm256_loadu_ps(input); - out = _mm256_mul_ps(in, in); + for (U32 i = 0; i < loops; i += 8) { + __m256 in = _mm256_loadu_ps(input + i); + __m256 out = _mm256_mul_ps(in, in); out = _mm256_mul_ps(out, in); out = _mm256_fmadd_ps(vec1, out, in); out = _mm256_mul_ps(vec0, out); @@ -161,136 +115,126 @@ inline EE activation_fp32(F32 *input, U32 len, ActivationParamSpec activationDes out = _mm256_add_ps(one, out); out = _mm256_mul_ps(vec2, out); out = _mm256_mul_ps(in, out); - _mm256_storeu_ps(output, out); - input += 8; - output += 8; - } - for (U32 i = 0; i < len_tail; i++) { - value = input[i]; - value = two_div_PI_sqrt * (value + 0.044715 * powf(value, 3)); - value = 1.0 - 2.0 / (exp(2.0 * value) + 1.0); - value = 0.5 * (1.0 + value); - value = input[i] * value; - output[i] = value; + _mm256_storeu_ps(output + i, out); } break; } case ACTIVATION_TANH: { - for (U32 i = 0; i < len_main; i++) { - in = _mm256_loadu_ps(input); - out = _mm256_tanh_ps(in); - _mm256_storeu_ps(output, out); - input += 8; - output += 8; - } - for (U32 i = 0; i < len_tail; i++) { - value = 1.0 - 2.0 / (exp(2.0 * input[i]) + 1.0); - output[i] = value; + for (U32 i = 0; i < loops; i += 8) { + __m256 in = _mm256_loadu_ps(input + i); + __m256 out = _mm256_tanh_ps(in); + _mm256_storeu_ps(output + i, out); } break; } case ACTIVATION_SIGMOID: { - for (U32 i = 0; i < len_main; i++) { - in = _mm256_loadu_ps(input); - out = _mm256_sigmod_ps(in); - _mm256_storeu_ps(output, out); - input += 8; - output += 8; + for (U32 i = 0; i < loops; i += 8) { + __m256 in = _mm256_loadu_ps(input + i); + __m256 out = _mm256_sigmod_ps(in); + _mm256_storeu_ps(output + i, out); } - for (U32 i = 0; i < len_tail; i++) { - value 
= 1.0 / (1.0 + exp(-1.0 * input[i])); - output[i] = value; + break; + } + case ACTIVATION_SWISH: { +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) schedule(static) +#endif + for (U32 i = 0; i < loops; i += 8) { + __m256 in = _mm256_loadu_ps(input + i); + __m256 out = _mm256_mul_ps(in, _mm256_sigmod_ps(in)); + _mm256_storeu_ps(output + i, out); } break; } case ACTIVATION_MISH: { - for (U32 i = 0; i < len_main; i++) { - in = _mm256_loadu_ps(input); - out = _mm256_mul_ps( + for (U32 i = 0; i < loops; i += 8) { + __m256 in = _mm256_loadu_ps(input + i); + __m256 out = _mm256_mul_ps( in, _mm256_tanh_ps(_mm256_log_ps(_mm256_add_ps(_mm256_exp_ps(in), one)))); - _mm256_storeu_ps(output, out); - input += 8; - output += 8; - } - for (U32 i = 0; i < len_tail; i++) { - value = input[i] * tanh(log(exp(input[i]) + 1.0)); - output[i] = value; + _mm256_storeu_ps(output + i, out); } break; } case ACTIVATION_SOFTPLUS: { - for (U32 i = 0; i < len_main; i++) { - in = _mm256_loadu_ps(input); - out = _mm256_log_ps(_mm256_add_ps(_mm256_exp_ps(in), one)); - _mm256_storeu_ps(output, out); - input += 8; - output += 8; - } - for (U32 i = 0; i < len_tail; i++) { - output[i] = log(1 + exp(input[i])); + for (U32 i = 0; i < loops; i += 8) { + __m256 in = _mm256_loadu_ps(input + i); + __m256 out = _mm256_log_ps(_mm256_add_ps(_mm256_exp_ps(in), one)); + _mm256_storeu_ps(output + i, out); } break; } case ACTIVATION_EXP: { - for (U32 i = 0; i < len_main; i++) { - in = _mm256_loadu_ps(input); - out = _mm256_exp_ps(in); - _mm256_storeu_ps(output, out); - input += 8; - output += 8; - } - for (U32 i = 0; i < len_tail; i++) { - output[i] = exp(input[i]); + for (U32 i = 0; i < loops; i += 8) { + __m256 in = _mm256_loadu_ps(input + i); + __m256 out = _mm256_exp_ps(in); + _mm256_storeu_ps(output + i, out); } break; } case ACTIVATION_ABS: { - for (U32 i = 0; i < len_main; i++) { - in = _mm256_loadu_ps(input); - out = _mm256_andnot_ps(signm, in); - _mm256_storeu_ps(output, out); - input += 8; - output += 8; + for (U32 i = 0; i < loops; i += 8) { + __m256 in = _mm256_loadu_ps(input + i); + __m256 out = _mm256_andnot_ps(signm, in); + _mm256_storeu_ps(output + i, out); } - for (U32 i = 0; i < len_tail; i++) { - output[i] = UNI_ABS(input[i]); + break; + } + case ACTIVATION_LOG: { + for (U32 i = 0; i < loops; i += 8) { + __m256 in = _mm256_loadu_ps(input + i); + __m256 out = _mm256_log_ps(in); + _mm256_storeu_ps(output + i, out); } break; } - case ACTIVATION_SIGN: { - for (U32 i = 0; i < len; i++) { - output[i] = UNI_SIGN(input[i]); + case ACTIVATION_ROUND: { + for (U32 i = 0; i < loops; i += 8) { + __m256 in = _mm256_loadu_ps(input + i); + __m256 out = _mm256_round_ps(in, _MM_FROUND_TO_NEAREST_INT); + _mm256_storeu_ps(output + i, out); } break; } - case ACTIVATION_LOG: { - for (U32 i = 0; i < len; i++) { - output[i] = log(input[i]); + case ACTIVATION_CEIL: { + for (U32 i = 0; i < loops; i += 8) { + __m256 in = _mm256_loadu_ps(input + i); + __m256 out = _mm256_ceil_ps(in); + _mm256_storeu_ps(output + i, out); } break; } - case ACTIVATION_NOT: { - for (U32 i = 0; i < len; i++) { - output[i] = (input[i] > 0) ? 0 : 1; + case ACTIVATION_FLOOR: { + for (U32 i = 0; i < loops; i += 8) { + __m256 in = _mm256_loadu_ps(input + i); + __m256 out = _mm256_floor_ps(in); + _mm256_storeu_ps(output + i, out); } break; } - case ACTIVATION_GREATER: { - for (U32 i = 0; i < len; i++) { - output[i] = input[i] > 1 ? 
1 : 0; + case ACTIVATION_RECIPROCAL: { + for (U32 i = 0; i < loops; i += 8) { + __m256 in = _mm256_loadu_ps(input + i); + __m256 out = _mm256_div_ps(one, in); + _mm256_storeu_ps(output + i, out); } break; } + case ACTIVATION_SIGN: + case ACTIVATION_NOT: + case ACTIVATION_GREATER: case ACTIVATION_NEG: { - for (U32 i = 0; i < len; i++) { - output[i] = -input[i]; - } + loops = 0; break; } default: ret = NOT_SUPPORTED; break; } + if (ret == SUCCESS) { + for (U32 i = loops; i < len; i++) { + ret = activation_template(activationDesc, input[i], output + i); + } + } return ret; } @@ -334,7 +278,7 @@ inline void array_power_f32(F32 *input, F32 *output, I32 len, F32 power) } } else if (power == 1) { if (input != output) { - memcpy(output, input, len * sizeof(F32)); + UNI_MEMCPY(output, input, len * sizeof(F32)); } i = len; } else if (power == 2) { @@ -478,8 +422,7 @@ inline F32 array_var_f32(const F32 *data, I32 len, F32 mean) sum_s += _mm256_sum_ps(sum_v); } for (; i < len; i++) { - F32 in = data[i]; - F32 tmp = in - mean; + F32 tmp = data[i] - mean; sum_s += tmp * tmp; } return sum_s / len; @@ -506,6 +449,26 @@ inline F32 array_sum_f32(const F32 *data, I32 len) return sum_s; } +inline I32 array_sum_i32(const I32 *data, I32 len) +{ + if (len <= 0) { + return 0; + } + + I32 i = 0; + I32 sum_s = 0; + __m256i sum_v = _mm256_set1_epi32(0); + for (i = 0; i < len - 7; i += 8) { + __m256i in = _mm256_loadu_si256((const __m256i *)(data + i)); + sum_v = _mm256_add_epi32(sum_v, in); + } + sum_s += _mm256_sum_epi32(sum_v); + for (; i < len; i++) { + sum_s += data[i]; + } + return sum_s; +} + // array mean inline F32 array_mean_f32(const F32 *data, I32 len) { diff --git a/compute/tensor/src/cpu/x86/int32/scale.cpp b/compute/tensor/src/cpu/x86/int32/scale.cpp new file mode 100644 index 00000000..0d0bafd5 --- /dev/null +++ b/compute/tensor/src/cpu/x86/int32/scale.cpp @@ -0,0 +1,151 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/x86/int32/tensor_computing_int32.h" + +static EE scale_nchwc8_int32( + I32 *input, I32 *alpha, I32 *beta, I32 in, I32 ic, I32 elements_per_channel, I32 *output) +{ + __m256i in_vec, out_vec; + __m256i one = _mm256_set1_epi32(1); + __m256i zero = _mm256_set1_epi32(0); + U32 index = 0; + for (I32 n = 0; n < in; n++) { + for (I32 c = 0; c < ic; c += 8) { + __m256i alpha_vec = (alpha == nullptr) ? one : _mm256_loadu_si256((const __m256i *)(alpha + c)); + __m256i beta_vec = (beta == nullptr) ? 
zero : _mm256_loadu_si256((const __m256i *)(beta + c)); + for (I32 i = 0; i < elements_per_channel; i++) { + in_vec = _mm256_loadu_si256((const __m256i *)(input + index)); + out_vec = _mm256_add_epi32(_mm256_mul_epi32(alpha_vec, in_vec), beta_vec); + _mm256_storeu_si256((__m256i *)(output + index), out_vec); + index += 8; + } + } + } + return SUCCESS; +} + +template <bool icoc_equal> +static EE scale_nchw_int32( + I32 *input, I32 *alpha, I32 *beta, I32 in, I32 ic, I32 elements_per_channel, I32 *output) +{ + __m256i one = _mm256_set1_epi32(1); + __m256i zero = _mm256_set1_epi32(0); + U32 dst = 0, src = 0; + for (I32 n = 0; n < in; n++) { + for (I32 c = 0; c < ic; c++) { + __m256i alpha_vec = (alpha == nullptr) ? one : _mm256_set1_epi32(alpha[c]); + __m256i beta_vec = (beta == nullptr) ? zero : _mm256_set1_epi32(beta[c]); + I32 i = 0; + for (; i < elements_per_channel - 7; i += 8) { + if (icoc_equal) { + src = (n * ic + c) * elements_per_channel + i; + } else { + src = n * elements_per_channel + i; + } + __m256i in_vec = _mm256_loadu_si256((const __m256i *)(input + src)); + __m256i out_vec = _mm256_add_epi32(_mm256_mul_epi32(alpha_vec, in_vec), beta_vec); + _mm256_storeu_si256((__m256i *)(output + dst), out_vec); + dst += 8; + } + for (; i < elements_per_channel; i++) { + if (icoc_equal) { + src = (n * ic + c) * elements_per_channel + i; + } else { + src = n * elements_per_channel + i; + } + int alpha_s = (alpha == nullptr) ? 1 : alpha[c]; + int beta_s = (beta == nullptr) ? 0 : beta[c]; + output[dst] = alpha_s * input[src] + beta_s; + dst++; + } + } + } + return SUCCESS; +} + +template <bool icoc_equal> +static EE scale_nhwc_int32( + I32 *input, I32 *alpha, I32 *beta, I32 in, I32 ic, I32 elements_per_channel, I32 *output) +{ + __m256i one = _mm256_set1_epi32(1); + __m256i zero = _mm256_set1_epi32(0); + __m256i in_vec; + int in_s; + U32 dst = 0, src = 0; + for (I32 n = 0; n < in; n++) { + for (I32 i = 0; i < elements_per_channel; i++, src++) { + I32 c = 0; + for (; c < ic - 7; c += 8) { + __m256i alpha_vec = (alpha == nullptr) ? one : _mm256_loadu_si256((const __m256i *)(alpha + c)); + __m256i beta_vec = (beta == nullptr) ? zero : _mm256_loadu_si256((const __m256i *)(beta + c)); + if (icoc_equal) { + in_vec = _mm256_loadu_si256((const __m256i *)(input + dst)); + } else { + in_vec = _mm256_set1_epi32(input[src]); + } + __m256i out_vec = _mm256_add_epi32(_mm256_mul_epi32(alpha_vec, in_vec), beta_vec); + _mm256_storeu_si256((__m256i *)(output + dst), out_vec); + dst += 8; + } + for (; c < ic; c++) { + int alpha_s = (alpha == nullptr) ? 1 : alpha[c]; + int beta_s = (beta == nullptr) ? 0 : beta[c]; + if (icoc_equal) { + in_s = input[dst]; + } else { + in_s = input[src]; + } + output[dst] = alpha_s * in_s + beta_s; + dst++; + } + } + } + return SUCCESS; +} + +EE scale_int32(I32 *input, + I32 axis, + I32 nDims, + I32 *alpha, + I32 *beta, + I32 on, + I32 oc, + I32 elements_per_channel, + I32 ic, + I32 *output) +{ + if (nullptr == input || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + EE ret = SUCCESS; + // If oc is 1, it means that weights/vectors have only one param, so we need to use the calculation logic of nchw. 
+ if (axis == 1 || axis == 0 || oc == 1) { + if (ic == oc) { + ret = scale_nchw_int32<true>(input, alpha, beta, on, oc, elements_per_channel, output); + } else { + ret = scale_nchw_int32<false>(input, alpha, beta, on, oc, elements_per_channel, output); + } + } else if (axis == nDims - 1) { + if (ic == oc) { + ret = scale_nhwc_int32<true>(input, alpha, beta, on, oc, elements_per_channel, output); + } else { + ret = scale_nhwc_int32<false>(input, alpha, beta, on, oc, elements_per_channel, output); + } + } else if (axis == nDims) { + ret = scale_nchwc8_int32(input, alpha, beta, on, oc, elements_per_channel, output); + } else { + CHECK_STATUS(NOT_SUPPORTED); + } + return ret; +} diff --git a/compute/tensor/src/cpu/x86/int32/tensor_computing_int32.h b/compute/tensor/src/cpu/x86/int32/tensor_computing_int32.h new file mode 100644 index 00000000..b52434ba --- /dev/null +++ b/compute/tensor/src/cpu/x86/int32/tensor_computing_int32.h @@ -0,0 +1,31 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef CHEETAH_TENSOR_COMPUTING_INT32_H +#define CHEETAH_TENSOR_COMPUTING_INT32_H + +#include "error.h" + +#include "thread_affinity.h" + +EE scale_int32(I32 *input, + I32 axis, + I32 nDims, + I32 *alpha, + I32 *beta, + I32 on, + I32 oc, + I32 elements_per_channel, + I32 ic, + I32 *output); +#endif diff --git a/compute/tensor/src/cpu/x86/int8/convolution.cpp b/compute/tensor/src/cpu/x86/int8/convolution.cpp index 277593d3..1581d5a1 100644 --- a/compute/tensor/src/cpu/x86/int8/convolution.cpp +++ b/compute/tensor/src/cpu/x86/int8/convolution.cpp @@ -12,8 +12,6 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#include "sys.h" -#include "error.h" - #include "cpu/x86/int8/tensor_computing_int8.h" EE convolution_infer_forward_tmp_bytes_int8(TensorDesc inputDesc, @@ -34,10 +32,10 @@ EE convolution_infer_forward_tmp_bytes_int8(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; U32 ih_pad = ih + paddingT + paddingB; U32 iw_pad = iw + paddingL + paddingR; @@ -62,6 +60,9 @@ EE convolution_infer_forward_tmp_bytes_int8(TensorDesc inputDesc, if (idf != DF_NCHWC16) { *bytes += icPadding * ih_pad * iw_pad; } + if (paddingT > 1 || paddingB > 1 || paddingL > 1 || paddingR > 1) { + *bytes += oc * 4; + } break; } case CONVOLUTION_ALGORITHM_GEMM_ICNCHW: @@ -95,12 +96,13 @@ EE convolution_infer_forward_tmp_bytes_int8(TensorDesc inputDesc, EE convolution_int8(TensorDesc inputDesc, UINT8 *input, + F32 *eltwiseInput, TensorDesc filterDesc, const INT8 *filter, ConvolutionParamSpec convParamSpec, ConvolutionForwardAlgorithm algorithm, TensorDesc biasDesc, - const I32 *bias, + const F32 *bias, U32 tmpBytes, void *tmp, TensorDesc outputDesc, @@ -132,11 +134,11 @@ EE convolution_int8(TensorDesc inputDesc, EE ret = SUCCESS; switch (algorithm) { case CONVOLUTION_ALGORITHM_DIRECT: - ret = convolution_direct(inputDesc, input, filterDesc, filter, convParamSpec, biasDesc, + ret = convolution_direct(inputDesc, input, eltwiseInput, filterDesc, filter, convParamSpec, biasDesc, bias, tmpBytes, tmp, outputDesc, output, scale, activationDesc); break; case CONVOLUTION_ALGORITHM_POINTWISE: - ret = convolution_1x1_direct(inputDesc, input, filterDesc, filter, convParamSpec, + ret = convolution_1x1_direct(inputDesc, input, eltwiseInput, filterDesc, filter, convParamSpec, biasDesc, bias, tmpBytes, tmp, outputDesc, output, scale, activationDesc); break; default: diff --git a/compute/tensor/src/cpu/x86/int8/convolution_1x1_direct.cpp b/compute/tensor/src/cpu/x86/int8/convolution_1x1_direct.cpp index a55a25a8..3dc729eb 100644 --- a/compute/tensor/src/cpu/x86/int8/convolution_1x1_direct.cpp +++ b/compute/tensor/src/cpu/x86/int8/convolution_1x1_direct.cpp @@ -17,1919 +17,2215 @@ #include "error.h" #include "transform_functions_int8.h" #include "cpu/x86/int8/tensor_computing_int8.h" +#include "cpu/x86/int8/convolution_functions.h" #include "cpu/x86/tensor_computing_x86.h" #define SIMDW 16 #define BLOCK_IC_DIM 256 #define BLOCK_HW_DIM 768 -struct ConvController { - UINT8 *input; - const INT8 *filter; - void *output; - UINT8 *u8Output; - const I32 *bias; - I64 ic; - I64 kw; - I64 kh; - I64 stepC16; - I64 dilateW; - I64 dilateH; - I64 ostepC16; - I64 flags; - I64 fStep; - I64 f8Step; - I64 f4Step; - void *scale; -}; - -typedef void (*kernelFunc)(ConvController &c); - // clang-format off -#define clear1Regs(rtype) \ - "vxorps "#rtype"0, "#rtype"0, "#rtype"0 \n\t" - -#define clear2Regs(rtype) \ - clear1Regs(rtype) \ - "vxorps "#rtype"1, "#rtype"1, "#rtype"1 \n\t" - -#define clear3Regs(rtype) \ - clear2Regs(rtype) \ - "vxorps "#rtype"2, "#rtype"2, "#rtype"2 \n\t" - -#define clear12Regs(rtype) \ - clear3Regs(rtype) \ - "vxorps 
"#rtype"3, "#rtype"3, "#rtype"3 \n\t" \ - "vxorps "#rtype"4, "#rtype"4, "#rtype"4 \n\t" \ - "vxorps "#rtype"5, "#rtype"5, "#rtype"5 \n\t" \ - "vxorps "#rtype"6, "#rtype"6, "#rtype"6 \n\t" \ - "vxorps "#rtype"7, "#rtype"7, "#rtype"7 \n\t" \ - "vxorps "#rtype"8, "#rtype"8, "#rtype"8 \n\t" \ - "vxorps "#rtype"9, "#rtype"9, "#rtype"9 \n\t" \ - "vxorps "#rtype"10, "#rtype"10, "#rtype"10 \n\t" \ - "vxorps "#rtype"11, "#rtype"11, "#rtype"11 \n\t" - -#define clear24Regs(rtype) \ - clear12Regs(rtype) \ - "vxorps "#rtype"12, "#rtype"12, "#rtype"12 \n\t" \ - "vxorps "#rtype"13, "#rtype"13, "#rtype"13 \n\t" \ - "vxorps "#rtype"14, "#rtype"14, "#rtype"14 \n\t" \ - "vxorps "#rtype"15, "#rtype"15, "#rtype"15 \n\t" \ - "vxorps "#rtype"16, "#rtype"16, "#rtype"16 \n\t" \ - "vxorps "#rtype"17, "#rtype"17, "#rtype"17 \n\t" \ - "vxorps "#rtype"18, "#rtype"18, "#rtype"18 \n\t" \ - "vxorps "#rtype"19, "#rtype"19, "#rtype"19 \n\t" \ - "vxorps "#rtype"20, "#rtype"20, "#rtype"20 \n\t" \ - "vxorps "#rtype"21, "#rtype"21, "#rtype"21 \n\t" \ - "vxorps "#rtype"22, "#rtype"22, "#rtype"22 \n\t" \ - "vxorps "#rtype"23, "#rtype"23, "#rtype"23 \n\t" - -#define reluReg(rtype) \ - "vpxord "#rtype"31, "#rtype"31, "#rtype"31 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"0, "#rtype"0 \n\t" - -#define relu2Regs(rtype) \ - reluReg(rtype) \ - "vpmaxsd "#rtype"31, "#rtype"1, "#rtype"1 \n\t" - -#define relu3Regs(rtype) \ - relu2Regs(rtype) \ - "vpmaxsd "#rtype"31, "#rtype"2, "#rtype"2 \n\t" - -#define relu12Regs(rtype) \ - relu3Regs(rtype) \ - "vpmaxsd "#rtype"31, "#rtype"3, "#rtype"3 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"4, "#rtype"4 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"5, "#rtype"5 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"6, "#rtype"6 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"7, "#rtype"7 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"8, "#rtype"8 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"9, "#rtype"9 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"10, "#rtype"10 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"11, "#rtype"11 \n\t" - -#define relu24Regs(rtype) \ - relu12Regs(rtype) \ - "vpmaxsd "#rtype"31, "#rtype"12, "#rtype"12 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"13, "#rtype"13 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"14, "#rtype"14 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"15, "#rtype"15 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"16, "#rtype"16 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"17, "#rtype"17 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"18, "#rtype"18 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"19, "#rtype"19 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"20, "#rtype"20 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"21, "#rtype"21 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"22, "#rtype"22 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"23, "#rtype"23 \n\t" - -#define convertRegI32ToF32(scalePtr, rtype) \ - "vbroadcastss ("#scalePtr"), "#rtype"24 \n\t" \ - "vcvtdq2ps "#rtype"0, "#rtype"0 \n\t" \ - "vmulps "#rtype"0, "#rtype"24, "#rtype"0 \n\t" \ - -#define convert2RegsI32ToF32(scalePtr, rtype) \ - "vbroadcastss ("#scalePtr"), "#rtype"24 \n\t" \ - "vcvtdq2ps "#rtype"0, "#rtype"0 \n\t" \ - "vcvtdq2ps "#rtype"1, "#rtype"1 \n\t" \ - "vmulps "#rtype"0, "#rtype"24, "#rtype"0 \n\t" \ - "vmulps "#rtype"1, "#rtype"24, "#rtype"1 \n\t" \ - -#define convert3RegsI32ToF32(scalePtr, rtype) \ - "vbroadcastss ("#scalePtr"), "#rtype"24 \n\t" \ - "vcvtdq2ps "#rtype"0, "#rtype"0 \n\t" \ - "vcvtdq2ps "#rtype"1, "#rtype"1 \n\t" \ - "vcvtdq2ps "#rtype"2, "#rtype"2 \n\t" \ - "vmulps "#rtype"0, "#rtype"24, "#rtype"0 \n\t" \ - "vmulps "#rtype"1, "#rtype"24, "#rtype"1 \n\t" \ - "vmulps "#rtype"2, "#rtype"24, "#rtype"2 \n\t" -#define 
convert12RegsI32ToF32(scalePtr, rtype) \ - "vbroadcastss ("#scalePtr"), "#rtype"24 \n\t" \ - "vcvtdq2ps "#rtype"0, "#rtype"0 \n\t" \ - "vcvtdq2ps "#rtype"1, "#rtype"1 \n\t" \ - "vcvtdq2ps "#rtype"2, "#rtype"2 \n\t" \ - "vcvtdq2ps "#rtype"3, "#rtype"3 \n\t" \ - "vcvtdq2ps "#rtype"4, "#rtype"4 \n\t" \ - "vcvtdq2ps "#rtype"5, "#rtype"5 \n\t" \ - "vcvtdq2ps "#rtype"6, "#rtype"6 \n\t" \ - "vcvtdq2ps "#rtype"7, "#rtype"7 \n\t" \ - "vcvtdq2ps "#rtype"8, "#rtype"8 \n\t" \ - "vcvtdq2ps "#rtype"9, "#rtype"9 \n\t" \ - "vcvtdq2ps "#rtype"10, "#rtype"10 \n\t" \ - "vcvtdq2ps "#rtype"11, "#rtype"11 \n\t" \ - "vmulps "#rtype"0, "#rtype"24, "#rtype"0 \n\t" \ - "vmulps "#rtype"1, "#rtype"24, "#rtype"1 \n\t" \ - "vmulps "#rtype"2, "#rtype"24, "#rtype"2 \n\t" \ - "vmulps "#rtype"3, "#rtype"24, "#rtype"3 \n\t" \ - "vmulps "#rtype"4, "#rtype"24, "#rtype"4 \n\t" \ - "vmulps "#rtype"5, "#rtype"24, "#rtype"5 \n\t" \ - "vmulps "#rtype"6, "#rtype"24, "#rtype"6 \n\t" \ - "vmulps "#rtype"7, "#rtype"24, "#rtype"7 \n\t" \ - "vmulps "#rtype"8, "#rtype"24, "#rtype"8 \n\t" \ - "vmulps "#rtype"9, "#rtype"24, "#rtype"9 \n\t" \ - "vmulps "#rtype"10, "#rtype"24, "#rtype"10 \n\t" \ - "vmulps "#rtype"11, "#rtype"24, "#rtype"11 \n\t" - -#define convert24RegsI32ToF32(scalePtr, rtype) \ - convert12RegsI32ToF32(scalePtr, rtype) \ - "vcvtdq2ps "#rtype"12, "#rtype"12 \n\t" \ - "vcvtdq2ps "#rtype"13, "#rtype"13 \n\t" \ - "vcvtdq2ps "#rtype"14, "#rtype"14 \n\t" \ - "vcvtdq2ps "#rtype"15, "#rtype"15 \n\t" \ - "vcvtdq2ps "#rtype"16, "#rtype"16 \n\t" \ - "vcvtdq2ps "#rtype"17, "#rtype"17 \n\t" \ - "vcvtdq2ps "#rtype"18, "#rtype"18 \n\t" \ - "vcvtdq2ps "#rtype"19, "#rtype"19 \n\t" \ - "vcvtdq2ps "#rtype"20, "#rtype"20 \n\t" \ - "vcvtdq2ps "#rtype"21, "#rtype"21 \n\t" \ - "vcvtdq2ps "#rtype"22, "#rtype"22 \n\t" \ - "vcvtdq2ps "#rtype"23, "#rtype"23 \n\t" \ - "vmulps "#rtype"12, "#rtype"24, "#rtype"12 \n\t" \ - "vmulps "#rtype"13, "#rtype"24, "#rtype"13 \n\t" \ - "vmulps "#rtype"14, "#rtype"24, "#rtype"14 \n\t" \ - "vmulps "#rtype"15, "#rtype"24, "#rtype"15 \n\t" \ - "vmulps "#rtype"16, "#rtype"24, "#rtype"16 \n\t" \ - "vmulps "#rtype"17, "#rtype"24, "#rtype"17 \n\t" \ - "vmulps "#rtype"18, "#rtype"24, "#rtype"18 \n\t" \ - "vmulps "#rtype"19, "#rtype"24, "#rtype"19 \n\t" \ - "vmulps "#rtype"20, "#rtype"24, "#rtype"20 \n\t" \ - "vmulps "#rtype"21, "#rtype"24, "#rtype"21 \n\t" \ - "vmulps "#rtype"22, "#rtype"24, "#rtype"22 \n\t" \ - "vmulps "#rtype"23, "#rtype"24, "#rtype"23 \n\t" -#define load48BiasTo3Regs(bias) \ - "vmovups ("#bias"), %%zmm0 \n\t" \ - "vmovups 0x40("#bias"), %%zmm1 \n\t" \ - "vmovups 0x80("#bias"), %%zmm2 \n\t" \ - -#define load48BiasTo12Regs(bias) \ - load48BiasTo3Regs(bias) \ - "vmovups %%zmm0, %%zmm3 \n\t" \ - "vmovups %%zmm1, %%zmm4 \n\t" \ - "vmovups %%zmm2, %%zmm5 \n\t" \ - "vmovups %%zmm0, %%zmm6 \n\t" \ - "vmovups %%zmm1, %%zmm7 \n\t" \ - "vmovups %%zmm2, %%zmm8 \n\t" \ - "vmovups %%zmm0, %%zmm9 \n\t" \ - "vmovups %%zmm1, %%zmm10 \n\t" \ - "vmovups %%zmm2, %%zmm11 \n\t" - -#define load48BiasTo24Regs(bias) \ - load48BiasTo12Regs(bias) \ - "vmovups %%zmm0, %%zmm12 \n\t" \ - "vmovups %%zmm1, %%zmm13 \n\t" \ - "vmovups %%zmm2, %%zmm14 \n\t" \ - "vmovups %%zmm0, %%zmm15 \n\t" \ - "vmovups %%zmm1, %%zmm16 \n\t" \ - "vmovups %%zmm2, %%zmm17 \n\t" \ - "vmovups %%zmm0, %%zmm18 \n\t" \ - "vmovups %%zmm1, %%zmm19 \n\t" \ - "vmovups %%zmm2, %%zmm20 \n\t" \ - "vmovups %%zmm0, %%zmm21 \n\t" \ - "vmovups %%zmm1, %%zmm22 \n\t" \ - "vmovups %%zmm2, %%zmm23 \n\t" - #ifdef _USE_AVX512_VNNI -#define convKernel8x48c4(input, freg0, freg1, 
freg2, off0, off1, off2, preg0, preg1, preg2) \ - "vpbroadcastd ("#input"), %%zmm30 \n\t" \ - "vpbroadcastd 0x10("#input"), %%zmm31 \n\t" \ - "vpdpbusd "#freg0", %%zmm30, %%zmm0 \n\t" \ - "vpdpbusd "#freg1", %%zmm30, %%zmm1 \n\t" \ - "vpdpbusd "#freg2", %%zmm30, %%zmm2 \n\t" \ - "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ - "vpdpbusd "#freg0", %%zmm31, %%zmm3 \n\t" \ - "vpdpbusd "#freg1", %%zmm31, %%zmm4 \n\t" \ - "vpdpbusd "#freg2", %%zmm31, %%zmm5 \n\t" \ - "vpbroadcastd 0x20("#input"), %%zmm30 \n\t" \ - "vpbroadcastd 0x30("#input"), %%zmm31 \n\t" \ - "vpdpbusd "#freg0", %%zmm30, %%zmm6 \n\t" \ - "vpdpbusd "#freg1", %%zmm30, %%zmm7 \n\t" \ - "vpdpbusd "#freg2", %%zmm30, %%zmm8 \n\t" \ - "vmovups "#off1"(%[filter]), "#preg1" \n\t" \ - "vpdpbusd "#freg0", %%zmm31, %%zmm9 \n\t" \ - "vpdpbusd "#freg1", %%zmm31, %%zmm10 \n\t" \ - "vpdpbusd "#freg2", %%zmm31, %%zmm11 \n\t" \ - "vpbroadcastd 0x40("#input"), %%zmm30 \n\t" \ - "vpbroadcastd 0x50("#input"), %%zmm31 \n\t" \ - "vpdpbusd "#freg0", %%zmm30, %%zmm12 \n\t" \ - "vpdpbusd "#freg1", %%zmm30, %%zmm13 \n\t" \ - "vpdpbusd "#freg2", %%zmm30, %%zmm14 \n\t" \ - "vmovups "#off2"(%[filter]), "#preg2" \n\t" \ - "vpdpbusd "#freg0", %%zmm31, %%zmm15 \n\t" \ - "vpdpbusd "#freg1", %%zmm31, %%zmm16 \n\t" \ - "vpdpbusd "#freg2", %%zmm31, %%zmm17 \n\t" \ - "vpbroadcastd 0x60("#input"), %%zmm30 \n\t" \ - "vpbroadcastd 0x70("#input"), %%zmm31 \n\t" \ - "vpdpbusd "#freg0", %%zmm30, %%zmm18 \n\t" \ - "vpdpbusd "#freg1", %%zmm30, %%zmm19 \n\t" \ - "vpdpbusd "#freg2", %%zmm30, %%zmm20 \n\t" \ - "vpdpbusd "#freg0", %%zmm31, %%zmm21 \n\t" \ - "vpdpbusd "#freg1", %%zmm31, %%zmm22 \n\t" \ - "vpdpbusd "#freg2", %%zmm31, %%zmm23 \n\t" - -#define convKernel4x48c4(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2) \ - "vpbroadcastd ("#input"), %%zmm30 \n\t" \ - "vpbroadcastd 0x10("#input"), %%zmm31 \n\t" \ - "vpdpbusd "#freg0", %%zmm30, %%zmm0 \n\t" \ - "vpdpbusd "#freg1", %%zmm30, %%zmm1 \n\t" \ - "vpdpbusd "#freg2", %%zmm30, %%zmm2 \n\t" \ - "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ - "vpdpbusd "#freg0", %%zmm31, %%zmm3 \n\t" \ - "vpdpbusd "#freg1", %%zmm31, %%zmm4 \n\t" \ - "vpdpbusd "#freg2", %%zmm31, %%zmm5 \n\t" \ - "vpbroadcastd 0x20("#input"), %%zmm30 \n\t" \ - "vpbroadcastd 0x30("#input"), %%zmm31 \n\t" \ - "vpdpbusd "#freg0", %%zmm30, %%zmm6 \n\t" \ - "vpdpbusd "#freg1", %%zmm30, %%zmm7 \n\t" \ - "vpdpbusd "#freg2", %%zmm30, %%zmm8 \n\t" \ - "vmovups "#off1"(%[filter]), "#preg1" \n\t" \ - "vpdpbusd "#freg0", %%zmm31, %%zmm9 \n\t" \ - "vpdpbusd "#freg1", %%zmm31, %%zmm10 \n\t" \ - "vpdpbusd "#freg2", %%zmm31, %%zmm11 \n\t" - -#define convKernel1x48c4(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2) \ - "vpbroadcastd ("#input"), %%zmm30 \n\t" \ - "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ - "vmovups "#off1"(%[filter]), "#preg1" \n\t" \ - "vmovups "#off2"(%[filter]), "#preg2" \n\t" \ - "vpdpbusd "#freg0", %%zmm30, %%zmm0 \n\t" \ - "vpdpbusd "#freg1", %%zmm30, %%zmm1 \n\t" \ - "vpdpbusd "#freg2", %%zmm30, %%zmm2 \n\t" +#define convKernel8x48c4(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2, \ + i0, i1, i2, i3, i4, i5, i6, i7) \ + "vpbroadcastd "#i0"("#input"), %%zmm30 \n\t" \ + "vpbroadcastd "#i1"("#input"), %%zmm31 \n\t" \ + "vpdpbusd "#freg0", %%zmm30, %%zmm0 \n\t" \ + "vpdpbusd "#freg1", %%zmm30, %%zmm1 \n\t" \ + "vpdpbusd "#freg2", %%zmm30, %%zmm2 \n\t" \ + "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ + "vpdpbusd "#freg0", %%zmm31, %%zmm3 \n\t" \ + "vpdpbusd "#freg1", %%zmm31, %%zmm4 \n\t" \ + 
"vpdpbusd "#freg2", %%zmm31, %%zmm5 \n\t" \ + "vpbroadcastd "#i2"("#input"), %%zmm30 \n\t" \ + "vpbroadcastd "#i3"("#input"), %%zmm31 \n\t" \ + "vpdpbusd "#freg0", %%zmm30, %%zmm6 \n\t" \ + "vpdpbusd "#freg1", %%zmm30, %%zmm7 \n\t" \ + "vpdpbusd "#freg2", %%zmm30, %%zmm8 \n\t" \ + "vmovups "#off1"(%[filter]), "#preg1" \n\t" \ + "vpdpbusd "#freg0", %%zmm31, %%zmm9 \n\t" \ + "vpdpbusd "#freg1", %%zmm31, %%zmm10 \n\t" \ + "vpdpbusd "#freg2", %%zmm31, %%zmm11 \n\t" \ + "vpbroadcastd "#i4"("#input"), %%zmm30 \n\t" \ + "vpbroadcastd "#i5"("#input"), %%zmm31 \n\t" \ + "vpdpbusd "#freg0", %%zmm30, %%zmm12 \n\t" \ + "vpdpbusd "#freg1", %%zmm30, %%zmm13 \n\t" \ + "vpdpbusd "#freg2", %%zmm30, %%zmm14 \n\t" \ + "vmovups "#off2"(%[filter]), "#preg2" \n\t" \ + "vpdpbusd "#freg0", %%zmm31, %%zmm15 \n\t" \ + "vpdpbusd "#freg1", %%zmm31, %%zmm16 \n\t" \ + "vpdpbusd "#freg2", %%zmm31, %%zmm17 \n\t" \ + "vpbroadcastd "#i6"("#input"), %%zmm30 \n\t" \ + "vpbroadcastd "#i7"("#input"), %%zmm31 \n\t" \ + "vpdpbusd "#freg0", %%zmm30, %%zmm18 \n\t" \ + "vpdpbusd "#freg1", %%zmm30, %%zmm19 \n\t" \ + "vpdpbusd "#freg2", %%zmm30, %%zmm20 \n\t" \ + "vpdpbusd "#freg0", %%zmm31, %%zmm21 \n\t" \ + "vpdpbusd "#freg1", %%zmm31, %%zmm22 \n\t" \ + "vpdpbusd "#freg2", %%zmm31, %%zmm23 \n\t" + +#define convKernel4x48c4(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2, \ + i0, i1, i2, i3, i4, i5, i6, i7) \ + "vpbroadcastd "#i0"("#input"), %%zmm30 \n\t" \ + "vpbroadcastd "#i1"("#input"), %%zmm31 \n\t" \ + "vpdpbusd "#freg0", %%zmm30, %%zmm0 \n\t" \ + "vpdpbusd "#freg1", %%zmm30, %%zmm1 \n\t" \ + "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ + "vpdpbusd "#freg2", %%zmm30, %%zmm2 \n\t" \ + "vpdpbusd "#freg0", %%zmm31, %%zmm3 \n\t" \ + "vpdpbusd "#freg1", %%zmm31, %%zmm4 \n\t" \ + "vpdpbusd "#freg2", %%zmm31, %%zmm5 \n\t" \ + "vpbroadcastd "#i2"("#input"), %%zmm30 \n\t" \ + "vpbroadcastd "#i3"("#input"), %%zmm31 \n\t" \ + "vpdpbusd "#freg0", %%zmm30, %%zmm6 \n\t" \ + "vpdpbusd "#freg1", %%zmm30, %%zmm7 \n\t" \ + "vmovups "#off1"(%[filter]), "#preg1" \n\t" \ + "vpdpbusd "#freg2", %%zmm30, %%zmm8 \n\t" \ + "vpdpbusd "#freg0", %%zmm31, %%zmm9 \n\t" \ + "vmovups "#off2"(%[filter]), "#preg2" \n\t" \ + "vpdpbusd "#freg1", %%zmm31, %%zmm10 \n\t" \ + "vpdpbusd "#freg2", %%zmm31, %%zmm11 \n\t" + +#define convKernel1x48c4(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2, \ + i0, i1, i2, i3, i4, i5, i6, i7) \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ + "vmovups "#off1"(%[filter]), "#preg1" \n\t" \ + "vmovups "#off2"(%[filter]), "#preg2" \n\t" \ + "vpdpbusd "#freg0", %%zmm30, %%zmm0 \n\t" \ + "vpdpbusd "#freg1", %%zmm30, %%zmm1 \n\t" \ + "vpdpbusd "#freg2", %%zmm30, %%zmm2 \n\t" #else -#define convKernel8x48c4_3(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2) \ - "vpbroadcastd ("#input"), %%zmm30 \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd 0x10("#input"), %%zmm30 \n\t" \ - "vpaddd %%zmm0, "#preg0", %%zmm0 \n\t" \ - "vpaddd %%zmm1, "#preg1", %%zmm1 \n\t" \ - "vpaddd %%zmm2, "#preg2", %%zmm2 \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, 
"#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd 0x20("#input"), %%zmm30 \n\t" \ - "vpaddd %%zmm3, "#preg0", %%zmm3 \n\t" \ - "vpaddd %%zmm4, "#preg1", %%zmm4 \n\t" \ - "vpaddd %%zmm5, "#preg2", %%zmm5 \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd 0x30("#input"), %%zmm30 \n\t" \ - "vpaddd %%zmm6, "#preg0", %%zmm6 \n\t" \ - "vpaddd %%zmm7, "#preg1", %%zmm7 \n\t" \ - "vpaddd %%zmm8, "#preg2", %%zmm8 \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd 0x40("#input"), %%zmm30 \n\t" \ - "vpaddd %%zmm9, "#preg0", %%zmm9 \n\t" \ - "vpaddd %%zmm10, "#preg1", %%zmm10 \n\t" \ - "vpaddd %%zmm11, "#preg2", %%zmm11 \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd 0x50("#input"), %%zmm30 \n\t" \ - "vpaddd %%zmm12, "#preg0", %%zmm12 \n\t" \ - "vpaddd %%zmm13, "#preg1", %%zmm13 \n\t" \ - "vpaddd %%zmm14, "#preg2", %%zmm14 \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd 0x60("#input"), %%zmm30 \n\t" \ - "vpaddd %%zmm15, "#preg0", %%zmm15 \n\t" \ - "vpaddd %%zmm16, "#preg1", %%zmm16 \n\t" \ - "vpaddd %%zmm17, "#preg2", %%zmm17 \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd 0x70("#input"), %%zmm30 \n\t" \ - "vpaddd %%zmm18, "#preg0", %%zmm18 \n\t" \ - "vpaddd %%zmm19, "#preg1", %%zmm19 \n\t" \ - "vpaddd %%zmm20, "#preg2", %%zmm20 \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vmovups "#off1"(%[filter]), "#freg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vmovups "#off2"(%[filter]), "#freg2" \n\t" \ - "vpaddd %%zmm21, "#preg0", %%zmm21 \n\t" \ - "vpaddd %%zmm22, "#preg1", %%zmm22 \n\t" \ - "vpaddd %%zmm23, "#preg2", %%zmm23 \n\t" - -#define convKernel4x48c4_3(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2) \ - "vpbroadcastd ("#input"), %%zmm30 \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ - "vpmaddwd "#preg0", 
%%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd 0x10("#input"), %%zmm30 \n\t" \ - "vpaddd %%zmm0, "#preg0", %%zmm0 \n\t" \ - "vpaddd %%zmm1, "#preg1", %%zmm1 \n\t" \ - "vpaddd %%zmm2, "#preg2", %%zmm2 \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd 0x20("#input"), %%zmm30 \n\t" \ - "vpaddd %%zmm3, "#preg0", %%zmm3 \n\t" \ - "vpaddd %%zmm4, "#preg1", %%zmm4 \n\t" \ - "vpaddd %%zmm5, "#preg2", %%zmm5 \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd 0x30("#input"), %%zmm30 \n\t" \ - "vpaddd %%zmm6, "#preg0", %%zmm6 \n\t" \ - "vpaddd %%zmm7, "#preg1", %%zmm7 \n\t" \ - "vpaddd %%zmm8, "#preg2", %%zmm8 \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vmovups "#off1"(%[filter]), "#freg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vmovups "#off2"(%[filter]), "#freg2" \n\t" \ - "vpaddd %%zmm9, "#preg0", %%zmm9 \n\t" \ - "vpaddd %%zmm10, "#preg1", %%zmm10 \n\t" \ - "vpaddd %%zmm11, "#preg2", %%zmm11 \n\t" - -#define convKernel1x48c4_3(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2) \ - "vpbroadcastd ("#input"), %%zmm30 \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vmovups "#off1"(%[filter]), "#freg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vmovups "#off2"(%[filter]), "#freg2" \n\t" \ - "vpaddd %%zmm0, "#preg0", %%zmm0 \n\t" \ - "vpaddd %%zmm1, "#preg1", %%zmm1 \n\t" \ - "vpaddd %%zmm2, "#preg2", %%zmm2 \n\t" - -#define convKernel8x48c4(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2) \ - convKernel8x48c4_3(input, %%zmm24, %%zmm25, %%zmm26, off0, off1, off2, %%zmm27, %%zmm28, %%zmm29) - -#define convKernel4x48c4(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2) \ - convKernel4x48c4_3(input, %%zmm24, %%zmm25, %%zmm26, off0, off1, off2, %%zmm27, %%zmm28, %%zmm29) - -#define convKernel1x48c4(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2) \ - convKernel1x48c4_3(input, %%zmm24, %%zmm25, %%zmm26, off0, off1, off2, %%zmm27, %%zmm28, %%zmm29) +#define convKernel8x48c4_3(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2, \ + i0, i1, i2, i3, i4, i5, i6, i7) \ + "vpbroadcastd "#i0"("#input"), %%zmm30 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd 
"#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd "#i1"("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm0, "#preg0", %%zmm0 \n\t" \ + "vpaddd %%zmm1, "#preg1", %%zmm1 \n\t" \ + "vpaddd %%zmm2, "#preg2", %%zmm2 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd "#i2"("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm3, "#preg0", %%zmm3 \n\t" \ + "vpaddd %%zmm4, "#preg1", %%zmm4 \n\t" \ + "vpaddd %%zmm5, "#preg2", %%zmm5 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd "#i3"("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm6, "#preg0", %%zmm6 \n\t" \ + "vpaddd %%zmm7, "#preg1", %%zmm7 \n\t" \ + "vpaddd %%zmm8, "#preg2", %%zmm8 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd "#i4"("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm9, "#preg0", %%zmm9 \n\t" \ + "vpaddd %%zmm10, "#preg1", %%zmm10 \n\t" \ + "vpaddd %%zmm11, "#preg2", %%zmm11 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd "#i5"("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm12, "#preg0", %%zmm12 \n\t" \ + "vpaddd %%zmm13, "#preg1", %%zmm13 \n\t" \ + "vpaddd %%zmm14, "#preg2", %%zmm14 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd "#i6"("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm15, "#preg0", %%zmm15 \n\t" \ + "vpaddd %%zmm16, "#preg1", %%zmm16 \n\t" \ + "vpaddd %%zmm17, "#preg2", %%zmm17 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd "#i7"("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm18, "#preg0", %%zmm18 \n\t" \ + "vpaddd %%zmm19, "#preg1", %%zmm19 \n\t" \ + "vpaddd %%zmm20, "#preg2", %%zmm20 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vmovups "#off1"(%[filter]), "#freg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vmovups "#off2"(%[filter]), "#freg2" \n\t" \ + "vpaddd %%zmm21, "#preg0", %%zmm21 \n\t" \ + 
"vpaddd %%zmm22, "#preg1", %%zmm22 \n\t" \ + "vpaddd %%zmm23, "#preg2", %%zmm23 \n\t" + +#define convKernel4x48c4_3(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2, \ + i0, i1, i2, i3, i4, i5, i6, i7) \ + "vpbroadcastd "#i0"("#input"), %%zmm30 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd "#i1"("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm0, "#preg0", %%zmm0 \n\t" \ + "vpaddd %%zmm1, "#preg1", %%zmm1 \n\t" \ + "vpaddd %%zmm2, "#preg2", %%zmm2 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd "#i2"("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm3, "#preg0", %%zmm3 \n\t" \ + "vpaddd %%zmm4, "#preg1", %%zmm4 \n\t" \ + "vpaddd %%zmm5, "#preg2", %%zmm5 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd "#i3"("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm6, "#preg0", %%zmm6 \n\t" \ + "vpaddd %%zmm7, "#preg1", %%zmm7 \n\t" \ + "vpaddd %%zmm8, "#preg2", %%zmm8 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vmovups "#off1"(%[filter]), "#freg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vmovups "#off2"(%[filter]), "#freg2" \n\t" \ + "vpaddd %%zmm9, "#preg0", %%zmm9 \n\t" \ + "vpaddd %%zmm10, "#preg1", %%zmm10 \n\t" \ + "vpaddd %%zmm11, "#preg2", %%zmm11 \n\t" + +#define convKernel1x48c4_3(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2, \ + i0, i1, i2, i3, i4, i5, i6, i7) \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vmovups "#off1"(%[filter]), "#freg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vmovups "#off2"(%[filter]), "#freg2" \n\t" \ + "vpaddd %%zmm0, "#preg0", %%zmm0 \n\t" \ + "vpaddd %%zmm1, "#preg1", %%zmm1 \n\t" \ + "vpaddd %%zmm2, "#preg2", %%zmm2 \n\t" + +#define convKernel8x48c4(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2, \ + i0, i1, i2, i3, i4, i5, i6, i7) \ + convKernel8x48c4_3(input, %%zmm24, %%zmm25, %%zmm26, off0, off1, off2, \ + %%zmm27, %%zmm28, %%zmm29, \ + i0, i1, i2, i3, i4, i5, i6, i7) + +#define convKernel4x48c4(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2, \ + i0, i1, i2, i3, i4, i5, i6, i7) \ + convKernel4x48c4_3(input, %%zmm24, %%zmm25, %%zmm26, off0, off1, off2, \ + %%zmm27, %%zmm28, %%zmm29, \ + i0, i1, i2, i3, i4, i5, i6, i7) + 
+#define convKernel1x48c4(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2, \ + i0, i1, i2, i3, i4, i5, i6, i7) \ + convKernel1x48c4_3(input, %%zmm24, %%zmm25, %%zmm26, off0, off1, off2, \ + %%zmm27, %%zmm28, %%zmm29, \ + i0, i1, i2, i3, i4, i5, i6, i7) #endif #define convKernelForLoopXx48(rnum, wsize) \ - __asm__ __volatile__("vmovups (%[filter]), %%zmm24 \n\t" \ - "vmovups 0x40(%[filter]), %%zmm25 \n\t" \ - "vmovups 0x80(%[filter]), %%zmm26 \n\t" \ - "addq $0xC0, %[filter] \n\t" \ - "mov $1, %%eax \n\t" \ - "vmovd %%eax, %%xmm0 \n\t" \ - "vpbroadcastw %%xmm0, %%zmm31 \n\t" \ - "movq %[flags], %%rax \n\t" \ - "andq $0x1, %%rax \n\t" \ - "jne 0f \n\t" \ - load48BiasTo##rnum##Regs(%[bias]) \ - "cmpq $0x10, %%rcx \n\t" \ - "jl 4f \n\t" \ - "jmp 1f \n\t" \ - ".align 16 \n\t" \ - "0: \n\t" \ - clear##rnum##Regs(%%zmm) \ - "cmpq $0x10, %%rcx \n\t" \ - "jl 4f \n\t" \ - ".align 16 \n\t" \ - "1: \n\t" \ - "movq %[input], %%rax \n\t" \ - convKernel##wsize##x48c4(%%rax, %%zmm24, %%zmm25, %%zmm26, 0x0, 0x40, 0x80, %%zmm27, %%zmm28, %%zmm29) \ - "addq $0x4, %%rax \n\t" \ - convKernel##wsize##x48c4(%%rax, %%zmm27, %%zmm28, %%zmm29, 0xC0, 0x100, 0x140, %%zmm24, %%zmm25, %%zmm26) \ - "addq $0x4, %%rax \n\t" \ - convKernel##wsize##x48c4(%%rax, %%zmm24, %%zmm25, %%zmm26, 0x180, 0x1C0, 0x200, %%zmm27, %%zmm28, %%zmm29) \ - "addq $0x4, %%rax \n\t" \ - convKernel##wsize##x48c4(%%rax, %%zmm27, %%zmm28, %%zmm29, 0x240, 0x280, 0x2C0, %%zmm24, %%zmm25, %%zmm26) \ - "addq $0x300, %[filter] \n\t" \ - "addq %[fStep], %[input] \n\t" \ - "subq $0x10, %%rcx \n\t" \ - "cmpq $0x10, %%rcx \n\t" \ - "jge 1b \n\t" \ - "subq %[fStep], %[input] \n\t" \ - "addq %[f8Step], %[input] \n\t" \ - ".align 16 \n\t" \ - "4: \n\t" \ - : "+c" (c.ic), [input] "+r" (c.input), [filter] "+r" (c.filter) \ - : [bias] "r" (c.bias), [kh] "r" (c.kh), [kw] "r" (c.kw), \ - [stepC16] "r" (c.stepC16), [fStep] "r" (c.fStep), [flags] "r" (c.flags), \ - [f8Step] "r" (c.f8Step) \ - : "%rax", "%rbx", "%r9", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", \ - "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", \ - "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", \ - "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", \ - "%zmm31", "memory", "cc"); \ - if (c.ic > 0) { \ - __asm__ __volatile__("cmpq $0x8, %%rcx \n\t" \ - "jl 2f \n\t" \ - "subq $0x8, %%rcx \n\t" \ - "movq %[input], %%rax \n\t" \ - convKernel##wsize##x48c4(%%rax, %%zmm24, %%zmm25, %%zmm26, 0x0, 0x40, 0x80, %%zmm27, %%zmm28, %%zmm29) \ - "addq $0x4, %%rax \n\t" \ - convKernel##wsize##x48c4(%%rax, %%zmm27, %%zmm28, %%zmm29, 0xC0, 0x100, 0x140, %%zmm24, %%zmm25, %%zmm26) \ - "addq $0x180, %[filter] \n\t" \ - "addq %[f4Step], %[input] \n\t" \ - ".align 16 \n\t" \ - "2: \n\t" \ - "cmpq $0x4, %%rcx \n\t" \ - "jl 5f \n\t" \ - convKernel##wsize##x48c4(%%rax, %%zmm24, %%zmm25, %%zmm26, 0x0, 0x40, 0x80, %%zmm27, %%zmm28, %%zmm29) \ - ".align 16 \n\t" \ - "5: \n\t" \ - : "+c" (c.ic) \ - : [input] "r" (c.input), [filter] "r" (c.filter), [bias] "r" (c.bias), [kh] "r" (c.kh), [kw] "r" (c.kw), \ - [stepC16] "r" (c.stepC16), [f4Step] "r" (c.f4Step) \ - : "%rax", "%rbx", "%r9", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", \ - "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", \ - "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", \ - "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", \ - 
"%zmm31", "memory", "cc"); \ + __asm__ __volatile__("vmovups (%[filter]), %%zmm24 \n\t" \ + "vmovups 0x40(%[filter]), %%zmm25 \n\t" \ + "vmovups 0x80(%[filter]), %%zmm26 \n\t" \ + "addq $0xC0, %[filter] \n\t" \ + "mov $1, %%eax \n\t" \ + "vmovd %%eax, %%xmm0 \n\t" \ + "vpbroadcastw %%xmm0, %%zmm31 \n\t" \ + "movq %[flags], %%rax \n\t" \ + "andq $0x1, %%rax \n\t" \ + "jne 0f \n\t" \ + load48BiasTo##rnum##Regs(%[bias]) \ + "cmpq $0x10, %%rcx \n\t" \ + "jl 4f \n\t" \ + "jmp 1f \n\t" \ + ".align 16 \n\t" \ + "0: \n\t" \ + clear##rnum##Regs(%%zmm) \ + "cmpq $0x10, %%rcx \n\t" \ + "jl 4f \n\t" \ + ".align 16 \n\t" \ + "1: \n\t" \ + "movq %[input], %%rax \n\t" \ + convKernel##wsize##x48c4(%%rax, %%zmm24, %%zmm25, %%zmm26, \ + 0x0, 0x40, 0x80, %%zmm27, %%zmm28, %%zmm29, \ + 0x0, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70) \ + "addq $0x4, %%rax \n\t" \ + convKernel##wsize##x48c4(%%rax, %%zmm27, %%zmm28, %%zmm29, \ + 0xC0, 0x100, 0x140, %%zmm24, %%zmm25, %%zmm26, \ + 0x0, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70) \ + "addq $0x4, %%rax \n\t" \ + convKernel##wsize##x48c4(%%rax, %%zmm24, %%zmm25, %%zmm26, \ + 0x180, 0x1C0, 0x200, \ + %%zmm27, %%zmm28, %%zmm29, \ + 0x0, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70) \ + "addq $0x4, %%rax \n\t" \ + convKernel##wsize##x48c4(%%rax, %%zmm27, %%zmm28, %%zmm29, \ + 0x240, 0x280, 0x2C0, \ + %%zmm24, %%zmm25, %%zmm26, \ + 0x0, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70) \ + "addq $0x300, %[filter] \n\t" \ + "addq %[fStep], %[input] \n\t" \ + "subq $0x10, %%rcx \n\t" \ + "cmpq $0x10, %%rcx \n\t" \ + "jge 1b \n\t" \ + "subq %[fStep], %[input] \n\t" \ + "addq %[f8Step], %[input] \n\t" \ + ".align 16 \n\t" \ + "4: \n\t" \ + : "+c" (c.ic), \ + [input] "+r" (c.input), \ + [filter] "+r" (c.filter) \ + : [bias] "r" (c.bias), \ + [kh] "r" (c.kh), \ + [kw] "r" (c.kw), \ + [fStep] "r" (c.fStep), \ + [flags] "r" (c.flags), \ + [f8Step] "r" (c.f8Step) \ + : "%rax", "%rbx", "%r9", \ + "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", \ + "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", \ + "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", \ + "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", \ + "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", \ + "%zmm30", "%zmm31", "memory", "cc"); \ + if (c.ic > 0) { \ + __asm__ __volatile__("cmpq $0x8, %%rcx \n\t" \ + "jl 2f \n\t" \ + "subq $0x8, %%rcx \n\t" \ + "movq %[input], %%rax \n\t" \ + convKernel##wsize##x48c4(%%rax, %%zmm24, %%zmm25, %%zmm26, \ + 0x0, 0x40, 0x80, \ + %%zmm27, %%zmm28, %%zmm29, \ + 0x0, 0x8, 0x10, 0x18, \ + 0x20, 0x28, 0x30, 0x38) \ + "addq $0x4, %%rax \n\t" \ + convKernel##wsize##x48c4(%%rax, %%zmm27, %%zmm28, %%zmm29, \ + 0xC0, 0x100, 0x140, \ + %%zmm24, %%zmm25, %%zmm26, \ + 0x0, 0x8, 0x10, 0x18, \ + 0x20, 0x28, 0x30, 0x38) \ + "addq $0x180, %[filter] \n\t" \ + "addq %[f4Step], %[input] \n\t" \ + ".align 16 \n\t" \ + "2: \n\t" \ + "cmpq $0x4, %%rcx \n\t" \ + "jl 5f \n\t" \ + convKernel##wsize##x48c4(%%rax, %%zmm24, %%zmm25, %%zmm26, \ + 0x0, 0x40, 0x80, \ + %%zmm27, %%zmm28, %%zmm29, \ + 0x0, 0x4, 0x8, 0xC, \ + 0x10, 0x14, 0x18, 0x1C) \ + ".align 16 \n\t" \ + "5: \n\t" \ + : "+c" (c.ic) \ + : [input] "r" (c.input), \ + [filter] "r" (c.filter), \ + [bias] "r" (c.bias), \ + [kh] "r" (c.kh), \ + [kw] "r" (c.kw), \ + [f4Step] "r" (c.f4Step) \ + : "%rax", "%rbx", "%r9", \ + "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", \ + "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", \ + "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", \ + "%zmm18", "%zmm19", "%zmm20", 
"%zmm21", "%zmm22", "%zmm23", \ + "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", \ + "%zmm30", "%zmm31", "memory", "cc"); \ } void Avx512Conv1x1Kernel8x48(ConvController &c) { convKernelForLoopXx48(24, 8) - __asm__ __volatile__("movq %[output], %%rax \n\t" - "movq %[ostepC16], %%rbx \n\t" - "movq %[flags], %%rcx \n\t" - "and $0x1, %%rcx \n\t" - "je 0f \n\t" - "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" - "vpaddd 0x40(%%rax), %%zmm3, %%zmm3 \n\t" - "vpaddd 0x80(%%rax), %%zmm6, %%zmm6 \n\t" - "vpaddd 0xC0(%%rax), %%zmm9, %%zmm9 \n\t" - "vpaddd 0x100(%%rax), %%zmm12, %%zmm12 \n\t" - "vpaddd 0x140(%%rax), %%zmm15, %%zmm15 \n\t" - "vpaddd 0x180(%%rax), %%zmm18, %%zmm18 \n\t" - "vpaddd 0x1C0(%%rax), %%zmm21, %%zmm21 \n\t" - "vpaddd (%%rax, %%rbx), %%zmm1, %%zmm1 \n\t" - "vpaddd 0x40(%%rax, %%rbx), %%zmm4, %%zmm4 \n\t" - "vpaddd 0x80(%%rax, %%rbx), %%zmm7, %%zmm7 \n\t" - "vpaddd 0xC0(%%rax, %%rbx), %%zmm10, %%zmm10 \n\t" - "vpaddd 0x100(%%rax, %%rbx), %%zmm13, %%zmm13 \n\t" - "vpaddd 0x140(%%rax, %%rbx), %%zmm16, %%zmm16 \n\t" - "vpaddd 0x180(%%rax, %%rbx), %%zmm19, %%zmm19 \n\t" - "vpaddd 0x1C0(%%rax, %%rbx), %%zmm22, %%zmm22 \n\t" - "vpaddd (%%rax, %%rbx, 2), %%zmm2, %%zmm2 \n\t" - "vpaddd 0x40(%%rax, %%rbx, 2), %%zmm5, %%zmm5 \n\t" - "vpaddd 0x80(%%rax, %%rbx, 2), %%zmm8, %%zmm8 \n\t" - "vpaddd 0xC0(%%rax, %%rbx, 2), %%zmm11, %%zmm11 \n\t" - "vpaddd 0x100(%%rax, %%rbx, 2), %%zmm14, %%zmm14 \n\t" - "vpaddd 0x140(%%rax, %%rbx, 2), %%zmm17, %%zmm17 \n\t" - "vpaddd 0x180(%%rax, %%rbx, 2), %%zmm20, %%zmm20 \n\t" - "vpaddd 0x1C0(%%rax, %%rbx, 2), %%zmm23, %%zmm23 \n\t" - - ".align 16 \n\t" - "0: \n\t" - "movq %[flags], %%rcx \n\t" + __asm__ __volatile__("movq %[output], %%rax \n\t" + "movq %[ostepC16], %%rbx \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x1, %%rcx \n\t" + "je 0f \n\t" + "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" + "vpaddd 0x40(%%rax), %%zmm3, %%zmm3 \n\t" + "vpaddd 0x80(%%rax), %%zmm6, %%zmm6 \n\t" + "vpaddd 0xC0(%%rax), %%zmm9, %%zmm9 \n\t" + "vpaddd 0x100(%%rax), %%zmm12, %%zmm12 \n\t" + "vpaddd 0x140(%%rax), %%zmm15, %%zmm15 \n\t" + "vpaddd 0x180(%%rax), %%zmm18, %%zmm18 \n\t" + "vpaddd 0x1C0(%%rax), %%zmm21, %%zmm21 \n\t" + "vpaddd (%%rax, %%rbx), %%zmm1, %%zmm1 \n\t" + "vpaddd 0x40(%%rax, %%rbx), %%zmm4, %%zmm4 \n\t" + "vpaddd 0x80(%%rax, %%rbx), %%zmm7, %%zmm7 \n\t" + "vpaddd 0xC0(%%rax, %%rbx), %%zmm10, %%zmm10 \n\t" + "vpaddd 0x100(%%rax, %%rbx), %%zmm13, %%zmm13 \n\t" + "vpaddd 0x140(%%rax, %%rbx), %%zmm16, %%zmm16 \n\t" + "vpaddd 0x180(%%rax, %%rbx), %%zmm19, %%zmm19 \n\t" + "vpaddd 0x1C0(%%rax, %%rbx), %%zmm22, %%zmm22 \n\t" + "vpaddd (%%rax, %%rbx, 2), %%zmm2, %%zmm2 \n\t" + "vpaddd 0x40(%%rax, %%rbx, 2), %%zmm5, %%zmm5 \n\t" + "vpaddd 0x80(%%rax, %%rbx, 2), %%zmm8, %%zmm8 \n\t" + "vpaddd 0xC0(%%rax, %%rbx, 2), %%zmm11, %%zmm11 \n\t" + "vpaddd 0x100(%%rax, %%rbx, 2), %%zmm14, %%zmm14 \n\t" + "vpaddd 0x140(%%rax, %%rbx, 2), %%zmm17, %%zmm17 \n\t" + "vpaddd 0x180(%%rax, %%rbx, 2), %%zmm20, %%zmm20 \n\t" + "vpaddd 0x1C0(%%rax, %%rbx, 2), %%zmm23, %%zmm23 \n\t" + + ".align 16 \n\t" + "0: \n\t" + "cmpq $0x0, %[scale] \n\t" + "jne 1f \n\t" + "movq %[flags], %%rcx \n\t" "and $0xC, %%rcx \n\t" - "je 1f \n\t" + "je 4f \n\t" relu24Regs(%%zmm) + "jmp 4f \n\t" - ".align 16 \n\t" - "1: \n\t" - "cmpq $0x0, %[scale] \n\t" - "je 2f \n\t" + ".align 16 \n\t" + "1: \n\t" convert24RegsI32ToF32(%[scale], %%zmm) - ".align 16 \n\t" - "2: \n\t" - "vmovups %%zmm0, (%%rax) \n\t" - "vmovups %%zmm3, 0x40(%%rax) \n\t" - "vmovups %%zmm6, 0x80(%%rax) \n\t" - "vmovups %%zmm9, 0xC0(%%rax) \n\t" - 
"vmovups %%zmm12, 0x100(%%rax) \n\t" - "vmovups %%zmm15, 0x140(%%rax) \n\t" - "vmovups %%zmm18, 0x180(%%rax) \n\t" - "vmovups %%zmm21, 0x1C0(%%rax) \n\t" - "vmovups %%zmm1, (%%rax, %%rbx) \n\t" - "vmovups %%zmm4, 0x40(%%rax, %%rbx) \n\t" - "vmovups %%zmm7, 0x80(%%rax, %%rbx) \n\t" - "vmovups %%zmm10, 0xC0(%%rax, %%rbx) \n\t" - "vmovups %%zmm13, 0x100(%%rax, %%rbx) \n\t" - "vmovups %%zmm16, 0x140(%%rax, %%rbx) \n\t" - "vmovups %%zmm19, 0x180(%%rax, %%rbx) \n\t" - "vmovups %%zmm22, 0x1C0(%%rax, %%rbx) \n\t" - "vmovups %%zmm2, (%%rax, %%rbx, 2) \n\t" - "vmovups %%zmm5, 0x40(%%rax, %%rbx, 2) \n\t" - "vmovups %%zmm8, 0x80(%%rax, %%rbx, 2) \n\t" - "vmovups %%zmm11, 0xC0(%%rax, %%rbx, 2) \n\t" - "vmovups %%zmm14, 0x100(%%rax, %%rbx, 2) \n\t" - "vmovups %%zmm17, 0x140(%%rax, %%rbx, 2) \n\t" - "vmovups %%zmm20, 0x180(%%rax, %%rbx, 2) \n\t" - "vmovups %%zmm23, 0x1C0(%%rax, %%rbx, 2) \n\t" + ".align 16 \n\t" + "2: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x2, %%rcx \n\t" + "je 3f \n\t" + "vaddps (%[eltwise]), %%zmm0, %%zmm0 \n\t" + "vaddps 0x40(%[eltwise]), %%zmm3, %%zmm3 \n\t" + "vaddps 0x80(%[eltwise]), %%zmm6, %%zmm6 \n\t" + "vaddps 0xC0(%[eltwise]), %%zmm9, %%zmm9 \n\t" + "vaddps 0x100(%[eltwise]), %%zmm12, %%zmm12 \n\t" + "vaddps 0x140(%[eltwise]), %%zmm15, %%zmm15 \n\t" + "vaddps 0x180(%[eltwise]), %%zmm18, %%zmm18 \n\t" + "vaddps 0x1C0(%[eltwise]), %%zmm21, %%zmm21 \n\t" + "vaddps (%[eltwise], %%rbx), %%zmm1, %%zmm1 \n\t" + "vaddps 0x40(%[eltwise], %%rbx), %%zmm4, %%zmm4 \n\t" + "vaddps 0x80(%[eltwise], %%rbx), %%zmm7, %%zmm7 \n\t" + "vaddps 0xC0(%[eltwise], %%rbx), %%zmm10, %%zmm10 \n\t" + "vaddps 0x100(%[eltwise], %%rbx), %%zmm13, %%zmm13 \n\t" + "vaddps 0x140(%[eltwise], %%rbx), %%zmm16, %%zmm16 \n\t" + "vaddps 0x180(%[eltwise], %%rbx), %%zmm19, %%zmm19 \n\t" + "vaddps 0x1C0(%[eltwise], %%rbx), %%zmm22, %%zmm22 \n\t" + "vaddps (%[eltwise], %%rbx, 2), %%zmm2, %%zmm2 \n\t" + "vaddps 0x40(%[eltwise], %%rbx, 2), %%zmm5, %%zmm5 \n\t" + "vaddps 0x80(%[eltwise], %%rbx, 2), %%zmm8, %%zmm8 \n\t" + "vaddps 0xC0(%[eltwise], %%rbx, 2), %%zmm11, %%zmm11 \n\t" + "vaddps 0x100(%[eltwise], %%rbx, 2), %%zmm14, %%zmm14 \n\t" + "vaddps 0x140(%[eltwise], %%rbx, 2), %%zmm17, %%zmm17 \n\t" + "vaddps 0x180(%[eltwise], %%rbx, 2), %%zmm20, %%zmm20 \n\t" + "vaddps 0x1C0(%[eltwise], %%rbx, 2), %%zmm23, %%zmm23 \n\t" + + ".align 16 \n\t" + "3: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" + relu24RegsPs(%%zmm) + + ".align 16 \n\t" + "4: \n\t" + "vmovups %%zmm0, (%%rax) \n\t" + "vmovups %%zmm3, 0x40(%%rax) \n\t" + "vmovups %%zmm6, 0x80(%%rax) \n\t" + "vmovups %%zmm9, 0xC0(%%rax) \n\t" + "vmovups %%zmm12, 0x100(%%rax) \n\t" + "vmovups %%zmm15, 0x140(%%rax) \n\t" + "vmovups %%zmm18, 0x180(%%rax) \n\t" + "vmovups %%zmm21, 0x1C0(%%rax) \n\t" + "vmovups %%zmm1, (%%rax, %%rbx) \n\t" + "vmovups %%zmm4, 0x40(%%rax, %%rbx) \n\t" + "vmovups %%zmm7, 0x80(%%rax, %%rbx) \n\t" + "vmovups %%zmm10, 0xC0(%%rax, %%rbx) \n\t" + "vmovups %%zmm13, 0x100(%%rax, %%rbx) \n\t" + "vmovups %%zmm16, 0x140(%%rax, %%rbx) \n\t" + "vmovups %%zmm19, 0x180(%%rax, %%rbx) \n\t" + "vmovups %%zmm22, 0x1C0(%%rax, %%rbx) \n\t" + "vmovups %%zmm2, (%%rax, %%rbx, 2) \n\t" + "vmovups %%zmm5, 0x40(%%rax, %%rbx, 2) \n\t" + "vmovups %%zmm8, 0x80(%%rax, %%rbx, 2) \n\t" + "vmovups %%zmm11, 0xC0(%%rax, %%rbx, 2) \n\t" + "vmovups %%zmm14, 0x100(%%rax, %%rbx, 2) \n\t" + "vmovups %%zmm17, 0x140(%%rax, %%rbx, 2) \n\t" + "vmovups %%zmm20, 0x180(%%rax, %%rbx, 2) \n\t" + "vmovups %%zmm23, 0x1C0(%%rax, %%rbx, 2) \n\t" : - : [output] "r" 
(c.output), [ostepC16] "r" (c.ostepC16), [flags] "r" (c.flags), [scale] "r" (c.scale) - : "%rax", "%rbx", "%rcx", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", - "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", - "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", - "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", - "%zmm31", "memory", "cc"); + : [output] "r" (c.output), + [eltwise] "r" (c.eltwise), + [ostepC16] "r" (c.ostepC16), + [flags] "r" (c.flags), + [scale] "r" (c.scale) + : "%rax", "%rbx", "%rcx", + "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", + "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", + "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", + "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", + "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", + "%zmm30", "%zmm31", "memory", "cc"); } void Avx512Conv1x1Kernel4x48(ConvController &c) { convKernelForLoopXx48(12, 4) - __asm__ __volatile__("movq %[output], %%rax \n\t" - "movq %[ostepC16], %%rbx \n\t" - "movq %[flags], %%rcx \n\t" - "and $0x1, %%rcx \n\t" - "je 0f \n\t" - "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" - "vpaddd 0x40(%%rax), %%zmm3, %%zmm3 \n\t" - "vpaddd 0x80(%%rax), %%zmm6, %%zmm6 \n\t" - "vpaddd 0xC0(%%rax), %%zmm9, %%zmm9 \n\t" - "vpaddd (%%rax, %%rbx), %%zmm1, %%zmm1 \n\t" - "vpaddd 0x40(%%rax, %%rbx), %%zmm4, %%zmm4 \n\t" - "vpaddd 0x80(%%rax, %%rbx), %%zmm7, %%zmm7 \n\t" - "vpaddd 0xC0(%%rax, %%rbx), %%zmm10, %%zmm10 \n\t" - "vpaddd (%%rax, %%rbx, 2), %%zmm2, %%zmm2 \n\t" - "vpaddd 0x40(%%rax, %%rbx, 2), %%zmm5, %%zmm5 \n\t" - "vpaddd 0x80(%%rax, %%rbx, 2), %%zmm8, %%zmm8 \n\t" - "vpaddd 0xC0(%%rax, %%rbx, 2), %%zmm11, %%zmm11 \n\t" - - ".align 16 \n\t" - "0: \n\t" - "movq %[flags], %%rcx \n\t" - "and $0xC, %%rcx \n\t" - "je 1f \n\t" + __asm__ __volatile__("movq %[output], %%rax \n\t" + "movq %[ostepC16], %%rbx \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x1, %%rcx \n\t" + "je 0f \n\t" + "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" + "vpaddd 0x40(%%rax), %%zmm3, %%zmm3 \n\t" + "vpaddd 0x80(%%rax), %%zmm6, %%zmm6 \n\t" + "vpaddd 0xC0(%%rax), %%zmm9, %%zmm9 \n\t" + "vpaddd (%%rax, %%rbx), %%zmm1, %%zmm1 \n\t" + "vpaddd 0x40(%%rax, %%rbx), %%zmm4, %%zmm4 \n\t" + "vpaddd 0x80(%%rax, %%rbx), %%zmm7, %%zmm7 \n\t" + "vpaddd 0xC0(%%rax, %%rbx), %%zmm10, %%zmm10 \n\t" + "vpaddd (%%rax, %%rbx, 2), %%zmm2, %%zmm2 \n\t" + "vpaddd 0x40(%%rax, %%rbx, 2), %%zmm5, %%zmm5 \n\t" + "vpaddd 0x80(%%rax, %%rbx, 2), %%zmm8, %%zmm8 \n\t" + "vpaddd 0xC0(%%rax, %%rbx, 2), %%zmm11, %%zmm11 \n\t" + + ".align 16 \n\t" + "0: \n\t" + "cmpq $0x0, %[scale] \n\t" + "jne 1f \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" relu12Regs(%%zmm) + "jmp 4f \n\t" - ".align 16 \n\t" - "1: \n\t" - "cmpq $0x0, %[scale] \n\t" - "je 2f \n\t" + ".align 16 \n\t" + "1: \n\t" convert12RegsI32ToF32(%[scale], %%zmm) - ".align 16 \n\t" - "2: \n\t" + ".align 16 \n\t" + "2: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x2, %%rcx \n\t" + "je 3f \n\t" + "vaddps (%[eltwise]), %%zmm0, %%zmm0 \n\t" + "vaddps 0x40(%[eltwise]), %%zmm3, %%zmm3 \n\t" + "vaddps 0x80(%[eltwise]), %%zmm6, %%zmm6 \n\t" + "vaddps 0xC0(%[eltwise]), %%zmm9, %%zmm9 \n\t" + "vaddps (%[eltwise], %%rbx), %%zmm1, %%zmm1 \n\t" + "vaddps 0x40(%[eltwise], %%rbx), %%zmm4, %%zmm4 \n\t" + "vaddps 0x80(%[eltwise], %%rbx), %%zmm7, %%zmm7 \n\t" + "vaddps 0xC0(%[eltwise], %%rbx), %%zmm10, %%zmm10 \n\t" + "vaddps (%[eltwise], %%rbx, 2), %%zmm2, %%zmm2 \n\t" + 
"vaddps 0x40(%[eltwise], %%rbx, 2), %%zmm5, %%zmm5 \n\t" + "vaddps 0x80(%[eltwise], %%rbx, 2), %%zmm8, %%zmm8 \n\t" + "vaddps 0xC0(%[eltwise], %%rbx, 2), %%zmm11, %%zmm11 \n\t" + + ".align 16 \n\t" + "3: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" + relu12RegsPs(%%zmm) + + ".align 16 \n\t" + "4: \n\t" "vmovups %%zmm0, (%%rax) \n\t" "vmovups %%zmm3, 0x40(%%rax) \n\t" "vmovups %%zmm6, 0x80(%%rax) \n\t" "vmovups %%zmm9, 0xC0(%%rax) \n\t" - "vmovups %%zmm1, (%%rax, %%rbx) \n\t" - "vmovups %%zmm4, 0x40(%%rax, %%rbx) \n\t" - "vmovups %%zmm7, 0x80(%%rax, %%rbx) \n\t" - "vmovups %%zmm10, 0xC0(%%rax, %%rbx) \n\t" - "vmovups %%zmm2, (%%rax, %%rbx, 2) \n\t" - "vmovups %%zmm5, 0x40(%%rax, %%rbx, 2) \n\t" - "vmovups %%zmm8, 0x80(%%rax, %%rbx, 2) \n\t" - "vmovups %%zmm11, 0xC0(%%rax, %%rbx, 2) \n\t" + "vmovups %%zmm1, (%%rax, %%rbx) \n\t" + "vmovups %%zmm4, 0x40(%%rax, %%rbx) \n\t" + "vmovups %%zmm7, 0x80(%%rax, %%rbx) \n\t" + "vmovups %%zmm10, 0xC0(%%rax, %%rbx) \n\t" + "vmovups %%zmm2, (%%rax, %%rbx, 2) \n\t" + "vmovups %%zmm5, 0x40(%%rax, %%rbx, 2) \n\t" + "vmovups %%zmm8, 0x80(%%rax, %%rbx, 2) \n\t" + "vmovups %%zmm11, 0xC0(%%rax, %%rbx, 2) \n\t" : - : [output] "r" (c.output), [ostepC16] "r" (c.ostepC16), [flags] "r" (c.flags), [scale] "r" (c.scale) - : "%rax", "%rbx", "%rcx", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", - "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", - "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", - "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", - "%zmm31", "memory", "cc"); + : [output] "r" (c.output), + [eltwise] "r" (c.eltwise), + [ostepC16] "r" (c.ostepC16), + [flags] "r" (c.flags), + [scale] "r" (c.scale) + : "%rax", "%rbx", "%rcx", + "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", + "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", + "%zmm24", "%zmm31", "memory", "cc"); + } void Avx512Conv1x1Kernel1x48(ConvController &c) { convKernelForLoopXx48(3, 1) - __asm__ __volatile__("movq %[output], %%rax \n\t" - "movq %[ostepC16], %%rbx \n\t" - "movq %[flags], %%rcx \n\t" - "and $0x1, %%rcx \n\t" - "je 0f \n\t" - "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" - "vpaddd (%%rax, %%rbx), %%zmm1, %%zmm1 \n\t" - "vpaddd (%%rax, %%rbx, 2), %%zmm2, %%zmm2 \n\t" - - ".align 16 \n\t" - "0: \n\t" - "movq %[flags], %%rcx \n\t" - "and $0xC, %%rcx \n\t" - "je 1f \n\t" + __asm__ __volatile__("movq %[output], %%rax \n\t" + "movq %[ostepC16], %%rbx \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x1, %%rcx \n\t" + "je 0f \n\t" + "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" + "vpaddd (%%rax, %%rbx), %%zmm1, %%zmm1 \n\t" + "vpaddd (%%rax, %%rbx, 2), %%zmm2, %%zmm2 \n\t" + + ".align 16 \n\t" + "0: \n\t" + "cmpq $0x0, %[scale] \n\t" + "jne 1f \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" relu3Regs(%%zmm) + "jmp 4f \n\t" - ".align 16 \n\t" - "1: \n\t" - "cmpq $0x0, %[scale] \n\t" - "je 2f \n\t" + ".align 16 \n\t" + "1: \n\t" convert3RegsI32ToF32(%[scale], %%zmm) - ".align 16 \n\t" - "2: \n\t" - "vmovups %%zmm0, (%%rax) \n\t" - "vmovups %%zmm1, (%%rax, %%rbx) \n\t" - "vmovups %%zmm2, (%%rax, %%rbx, 2) \n\t" + ".align 16 \n\t" + "2: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x2, %%rcx \n\t" + "je 3f \n\t" + "vaddps (%[eltwise]), %%zmm0, %%zmm0 \n\t" + "vaddps (%[eltwise], %%rbx), %%zmm1, %%zmm1 \n\t" + "vaddps (%[eltwise], %%rbx, 2), %%zmm2, %%zmm2 \n\t" + + ".align 16 \n\t" + "3: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx 
\n\t" + "je 4f \n\t" + relu3RegsPs(%%zmm) + + ".align 16 \n\t" + "4: \n\t" + "vmovups %%zmm0, (%%rax) \n\t" + "vmovups %%zmm1, (%%rax, %%rbx) \n\t" + "vmovups %%zmm2, (%%rax, %%rbx, 2) \n\t" : - : [output] "r" (c.output), [ostepC16] "r" (c.ostepC16), [flags] "r" (c.flags), [scale] "r" (c.scale) - : "%rax", "%rbx", "%rcx", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", - "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", - "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", - "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", - "%zmm31", "memory", "cc"); + : [output] "r" (c.output), + [ostepC16] "r" (c.ostepC16), + [eltwise] "r" (c.eltwise), + [flags] "r" (c.flags), + [scale] "r" (c.scale) + : "%rax", "%rbx", "%rcx", + "%zmm0", "%zmm1", "%zmm2", + "%zmm24", "%zmm31", "memory", "cc"); } -#define load32BiasTo2Regs(bias) \ - "vmovups ("#bias"), %%zmm0 \n\t" \ - "vmovups 0x40("#bias"), %%zmm1 \n\t" \ - -#define load32BiasTo12Regs(bias) \ - load32BiasTo2Regs(bias) \ - "vmovups %%zmm0, %%zmm2 \n\t" \ - "vmovups %%zmm1, %%zmm3 \n\t" \ - "vmovups %%zmm0, %%zmm4 \n\t" \ - "vmovups %%zmm1, %%zmm5 \n\t" \ - "vmovups %%zmm0, %%zmm6 \n\t" \ - "vmovups %%zmm1, %%zmm7 \n\t" \ - "vmovups %%zmm0, %%zmm8 \n\t" \ - "vmovups %%zmm1, %%zmm9 \n\t" \ - "vmovups %%zmm0, %%zmm10 \n\t" \ - "vmovups %%zmm1, %%zmm11 \n\t" - -#define load32BiasTo24Regs(bias) \ - load32BiasTo12Regs(bias) \ - "vmovups %%zmm0, %%zmm12 \n\t" \ - "vmovups %%zmm1, %%zmm13 \n\t" \ - "vmovups %%zmm0, %%zmm14 \n\t" \ - "vmovups %%zmm1, %%zmm15 \n\t" \ - "vmovups %%zmm0, %%zmm16 \n\t" \ - "vmovups %%zmm1, %%zmm17 \n\t" \ - "vmovups %%zmm0, %%zmm18 \n\t" \ - "vmovups %%zmm1, %%zmm19 \n\t" \ - "vmovups %%zmm0, %%zmm20 \n\t" \ - "vmovups %%zmm1, %%zmm21 \n\t" \ - "vmovups %%zmm0, %%zmm22 \n\t" \ - "vmovups %%zmm1, %%zmm23 \n\t" - #ifdef _USE_AVX512_VNNI -#define convKernel12x32c4(input, freg0, freg1, off0, off1, preg0, preg1) \ - "vpbroadcastd ("#input"), %%zmm28 \n\t" \ - "vpbroadcastd 0x10("#input"), %%zmm29 \n\t" \ +#define convKernel12x32c4(input, freg0, freg1, off0, off1, preg0, preg1, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11) \ + "vpbroadcastd "#i0"("#input"), %%zmm28 \n\t" \ + "vpbroadcastd "#i1"("#input"), %%zmm29 \n\t" \ "vpdpbusd "#freg0", %%zmm28, %%zmm0 \n\t" \ "vpdpbusd "#freg1", %%zmm28, %%zmm1 \n\t" \ - "vpbroadcastd 0x20("#input"), %%zmm30 \n\t" \ - "vpbroadcastd 0x30("#input"), %%zmm31 \n\t" \ + "vpbroadcastd "#i2"("#input"), %%zmm30 \n\t" \ + "vpbroadcastd "#i3"("#input"), %%zmm31 \n\t" \ "vpdpbusd "#freg0", %%zmm29, %%zmm2 \n\t" \ "vpdpbusd "#freg1", %%zmm29, %%zmm3 \n\t" \ "vpdpbusd "#freg0", %%zmm30, %%zmm4 \n\t" \ "vpdpbusd "#freg1", %%zmm30, %%zmm5 \n\t" \ - "vpbroadcastd 0x40("#input"), %%zmm28 \n\t" \ - "vpbroadcastd 0x50("#input"), %%zmm29 \n\t" \ + "vpbroadcastd "#i4"("#input"), %%zmm28 \n\t" \ + "vpbroadcastd "#i5"("#input"), %%zmm29 \n\t" \ "vpdpbusd "#freg0", %%zmm31, %%zmm6 \n\t" \ "vpdpbusd "#freg1", %%zmm31, %%zmm7 \n\t" \ - "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ + "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ "vpdpbusd "#freg0", %%zmm28, %%zmm8 \n\t" \ "vpdpbusd "#freg1", %%zmm28, %%zmm9 \n\t" \ - "vpbroadcastd 0x60("#input"), %%zmm30 \n\t" \ - "vpbroadcastd 0x70("#input"), %%zmm31 \n\t" \ - "vpdpbusd "#freg0", %%zmm29, %%zmm10 \n\t" \ - "vpdpbusd "#freg1", %%zmm29, %%zmm11 \n\t" \ - "vmovups "#off1"(%[filter]), "#preg1" \n\t" \ - "vpdpbusd "#freg0", %%zmm30, %%zmm12 \n\t" \ - "vpdpbusd "#freg1", 
%%zmm30, %%zmm13 \n\t" \ - "vpbroadcastd 0x80("#input"), %%zmm28 \n\t" \ - "vpbroadcastd 0x90("#input"), %%zmm29 \n\t" \ - "vpdpbusd "#freg0", %%zmm31, %%zmm14 \n\t" \ - "vpdpbusd "#freg1", %%zmm31, %%zmm15 \n\t" \ - "vpdpbusd "#freg0", %%zmm28, %%zmm16 \n\t" \ - "vpdpbusd "#freg1", %%zmm28, %%zmm17 \n\t" \ - "vpbroadcastd 0xA0("#input"), %%zmm30 \n\t" \ - "vpbroadcastd 0xB0("#input"), %%zmm31 \n\t" \ - "vpdpbusd "#freg0", %%zmm29, %%zmm18 \n\t" \ - "vpdpbusd "#freg1", %%zmm29, %%zmm19 \n\t" \ - "vpdpbusd "#freg0", %%zmm30, %%zmm20 \n\t" \ - "vpdpbusd "#freg1", %%zmm30, %%zmm21 \n\t" \ - "vpdpbusd "#freg0", %%zmm31, %%zmm22 \n\t" \ - "vpdpbusd "#freg1", %%zmm31, %%zmm23 \n\t" - -#define convKernel6x32c4(input, freg0, freg1, off0, off1, preg0, preg1) \ - "vpbroadcastd ("#input"), %%zmm28 \n\t" \ - "vpbroadcastd 0x10("#input"), %%zmm29 \n\t" \ - "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ + "vpbroadcastd "#i6"("#input"), %%zmm30 \n\t" \ + "vpbroadcastd "#i7"("#input"), %%zmm31 \n\t" \ + "vpdpbusd "#freg0", %%zmm29, %%zmm10 \n\t" \ + "vpdpbusd "#freg1", %%zmm29, %%zmm11 \n\t" \ + "vmovups "#off1"(%[filter]), "#preg1" \n\t" \ + "vpdpbusd "#freg0", %%zmm30, %%zmm12 \n\t" \ + "vpdpbusd "#freg1", %%zmm30, %%zmm13 \n\t" \ + "vpbroadcastd "#i8"("#input"), %%zmm28 \n\t" \ + "vpbroadcastd "#i9"("#input"), %%zmm29 \n\t" \ + "vpdpbusd "#freg0", %%zmm31, %%zmm14 \n\t" \ + "vpdpbusd "#freg1", %%zmm31, %%zmm15 \n\t" \ + "vpdpbusd "#freg0", %%zmm28, %%zmm16 \n\t" \ + "vpdpbusd "#freg1", %%zmm28, %%zmm17 \n\t" \ + "vpbroadcastd "#i10"("#input"), %%zmm30 \n\t" \ + "vpbroadcastd "#i11"("#input"), %%zmm31 \n\t" \ + "vpdpbusd "#freg0", %%zmm29, %%zmm18 \n\t" \ + "vpdpbusd "#freg1", %%zmm29, %%zmm19 \n\t" \ + "vpdpbusd "#freg0", %%zmm30, %%zmm20 \n\t" \ + "vpdpbusd "#freg1", %%zmm30, %%zmm21 \n\t" \ + "vpdpbusd "#freg0", %%zmm31, %%zmm22 \n\t" \ + "vpdpbusd "#freg1", %%zmm31, %%zmm23 \n\t" + +#define convKernel6x32c4(input, freg0, freg1, off0, off1, preg0, preg1, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11) \ + "vpbroadcastd "#i0"("#input"), %%zmm28 \n\t" \ + "vpbroadcastd "#i1"("#input"), %%zmm29 \n\t" \ + "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ "vpdpbusd "#freg0", %%zmm28, %%zmm0 \n\t" \ "vpdpbusd "#freg1", %%zmm28, %%zmm1 \n\t" \ - "vpbroadcastd 0x20("#input"), %%zmm30 \n\t" \ - "vpbroadcastd 0x30("#input"), %%zmm31 \n\t" \ + "vpbroadcastd "#i2"("#input"), %%zmm30 \n\t" \ + "vpbroadcastd "#i3"("#input"), %%zmm31 \n\t" \ "vpdpbusd "#freg0", %%zmm29, %%zmm2 \n\t" \ "vpdpbusd "#freg1", %%zmm29, %%zmm3 \n\t" \ "vpdpbusd "#freg0", %%zmm30, %%zmm4 \n\t" \ "vpdpbusd "#freg1", %%zmm30, %%zmm5 \n\t" \ - "vpbroadcastd 0x40("#input"), %%zmm28 \n\t" \ - "vpbroadcastd 0x50("#input"), %%zmm29 \n\t" \ + "vpbroadcastd "#i4"("#input"), %%zmm28 \n\t" \ + "vpbroadcastd "#i5"("#input"), %%zmm29 \n\t" \ "vpdpbusd "#freg0", %%zmm31, %%zmm6 \n\t" \ "vpdpbusd "#freg1", %%zmm31, %%zmm7 \n\t" \ - "vmovups "#off1"(%[filter]), "#preg1" \n\t" \ + "vmovups "#off1"(%[filter]), "#preg1" \n\t" \ "vpdpbusd "#freg0", %%zmm28, %%zmm8 \n\t" \ "vpdpbusd "#freg1", %%zmm28, %%zmm9 \n\t" \ - "vpdpbusd "#freg0", %%zmm29, %%zmm10 \n\t" \ - "vpdpbusd "#freg1", %%zmm29, %%zmm11 \n\t" - -#define convKernel1x32c4(input, freg0, freg1, off0, off1, preg0, preg1) \ - "vpbroadcastd ("#input"), %%zmm28 \n\t" \ - "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ - "vmovups "#off1"(%[filter]), "#preg1" \n\t" \ + "vpdpbusd "#freg0", %%zmm29, %%zmm10 \n\t" \ + "vpdpbusd "#freg1", %%zmm29, %%zmm11 \n\t" + +#define convKernel1x32c4(input, freg0, freg1, 
off0, off1, preg0, preg1, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11) \ + "vpbroadcastd ("#input"), %%zmm28 \n\t" \ + "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ + "vmovups "#off1"(%[filter]), "#preg1" \n\t" \ "vpdpbusd "#freg0", %%zmm28, %%zmm0 \n\t" \ "vpdpbusd "#freg1", %%zmm28, %%zmm1 \n\t" #else -#define convKernel12x32c4_3(input, freg0, freg1, off0, off1, preg0, preg1, preg2) \ - "vpbroadcastd ("#input"), %%zmm29 \n\t" \ - "vpbroadcastd 0x10("#input"), %%zmm30 \n\t" \ - "vpmaddubsw "#freg0", %%zmm29, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm29, "#preg1" \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg2" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd 0x20("#input"), %%zmm29 \n\t" \ - "vpaddd %%zmm0, "#preg0", %%zmm0 \n\t" \ - "vpaddd %%zmm1, "#preg1", %%zmm1 \n\t" \ - "vpaddd %%zmm2, "#preg2", %%zmm2 \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg0", %%zmm29, "#preg1" \n\t" \ - "vpmaddubsw "#freg1", %%zmm29, "#preg2" \n\t" \ - "vpbroadcastd 0x30("#input"), %%zmm30 \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd 0x40("#input"), %%zmm29 \n\t" \ - "vpaddd %%zmm3, "#preg0", %%zmm3 \n\t" \ - "vpaddd %%zmm4, "#preg1", %%zmm4 \n\t" \ - "vpaddd %%zmm5, "#preg2", %%zmm5 \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg0", %%zmm29, "#preg2" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd 0x50("#input"), %%zmm30 \n\t" \ - "vpaddd %%zmm6, "#preg0", %%zmm6 \n\t" \ - "vpaddd %%zmm7, "#preg1", %%zmm7 \n\t" \ - "vpaddd %%zmm8, "#preg2", %%zmm8 \n\t" \ - "vpmaddubsw "#freg1", %%zmm29, "#preg0" \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg2" \n\t" \ - "vpbroadcastd 0x60("#input"), %%zmm29 \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd 0x70("#input"), %%zmm30 \n\t" \ - "vpaddd %%zmm9, "#preg0", %%zmm9 \n\t" \ - "vpaddd %%zmm10, "#preg1", %%zmm10 \n\t" \ - "vpaddd %%zmm11, "#preg2", %%zmm11 \n\t" \ - "vpmaddubsw "#freg0", %%zmm29, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm29, "#preg1" \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg2" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd 0x80("#input"), %%zmm29 \n\t" \ - "vpaddd %%zmm12, "#preg0", %%zmm12 \n\t" \ - "vpaddd %%zmm13, "#preg1", %%zmm13 \n\t" \ - "vpaddd %%zmm14, "#preg2", %%zmm14 \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg0", %%zmm29, "#preg1" \n\t" \ - "vpmaddubsw "#freg1", %%zmm29, "#preg2" \n\t" \ - "vpbroadcastd 0x90("#input"), %%zmm30 \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd 0xA0("#input"), %%zmm29 \n\t" \ - "vpaddd %%zmm15, "#preg0", %%zmm15 \n\t" \ - "vpaddd %%zmm16, "#preg1", %%zmm16 \n\t" \ - "vpaddd %%zmm17, "#preg2", %%zmm17 \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", 
%%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg0", %%zmm29, "#preg2" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd 0xB0("#input"), %%zmm30 \n\t" \ - "vpaddd %%zmm18, "#preg0", %%zmm18 \n\t" \ - "vpaddd %%zmm19, "#preg1", %%zmm19 \n\t" \ - "vpaddd %%zmm20, "#preg2", %%zmm20 \n\t" \ - "vpmaddubsw "#freg1", %%zmm29, "#preg0" \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg2" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vmovups "#off1"(%[filter]), "#freg1" \n\t" \ - "vpaddd %%zmm21, "#preg0", %%zmm21 \n\t" \ - "vpaddd %%zmm22, "#preg1", %%zmm22 \n\t" \ - "vpaddd %%zmm23, "#preg2", %%zmm23 \n\t" - -#define convKernel6x32c4_3(input, freg0, freg1, off0, off1, preg0, preg1, preg2) \ - "vpbroadcastd ("#input"), %%zmm29 \n\t" \ - "vpbroadcastd 0x10("#input"), %%zmm30 \n\t" \ - "vpmaddubsw "#freg0", %%zmm29, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm29, "#preg1" \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg2" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd 0x20("#input"), %%zmm29 \n\t" \ - "vpaddd %%zmm0, "#preg0", %%zmm0 \n\t" \ - "vpaddd %%zmm1, "#preg1", %%zmm1 \n\t" \ - "vpaddd %%zmm2, "#preg2", %%zmm2 \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg0", %%zmm29, "#preg1" \n\t" \ - "vpmaddubsw "#freg1", %%zmm29, "#preg2" \n\t" \ - "vpbroadcastd 0x30("#input"), %%zmm30 \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd 0x40("#input"), %%zmm29 \n\t" \ - "vpaddd %%zmm3, "#preg0", %%zmm3 \n\t" \ - "vpaddd %%zmm4, "#preg1", %%zmm4 \n\t" \ - "vpaddd %%zmm5, "#preg2", %%zmm5 \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg0", %%zmm29, "#preg2" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd 0x50("#input"), %%zmm30 \n\t" \ - "vpaddd %%zmm6, "#preg0", %%zmm6 \n\t" \ - "vpaddd %%zmm7, "#preg1", %%zmm7 \n\t" \ - "vpaddd %%zmm8, "#preg2", %%zmm8 \n\t" \ - "vpmaddubsw "#freg1", %%zmm29, "#preg0" \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg2" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vmovups "#off1"(%[filter]), "#freg1" \n\t" \ - "vpaddd %%zmm9, "#preg0", %%zmm9 \n\t" \ - "vpaddd %%zmm10, "#preg1", %%zmm10 \n\t" \ - "vpaddd %%zmm11, "#preg2", %%zmm11 \n\t" - -#define convKernel1x32c4_3(input, freg0, freg1, off0, off1, preg0, preg1, preg2) \ - "vpbroadcastd ("#input"), %%zmm29 \n\t" \ - "vpmaddubsw "#freg0", %%zmm29, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm29, "#preg1" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vmovups "#off1"(%[filter]), "#freg1" \n\t" \ - "vpaddd %%zmm0, "#preg0", %%zmm0 \n\t" \ - 
"vpaddd %%zmm1, "#preg1", %%zmm1 \n\t" - -#define convKernel12x32c4(input, freg0, freg1, off0, off1, preg0, preg1) \ - convKernel12x32c4_3(input, %%zmm24, %%zmm25, off0, off1, %%zmm26, %%zmm27, %%zmm28) - -#define convKernel6x32c4(input, freg0, freg1, off0, off1, preg0, preg1) \ - convKernel6x32c4_3(input, %%zmm24, %%zmm25, off0, off1, %%zmm26, %%zmm27, %%zmm28) - -#define convKernel1x32c4(input, freg0, freg1, off0, off1, preg0, preg1) \ - convKernel1x32c4_3(input, %%zmm24, %%zmm25, off0, off1, %%zmm26, %%zmm27, %%zmm28) +#define convKernel12x32c4_3(input, freg0, freg1, off0, off1, preg0, preg1, preg2, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11) \ + "vpbroadcastd "#i0"("#input"), %%zmm29 \n\t" \ + "vpbroadcastd "#i1"("#input"), %%zmm30 \n\t" \ + "vpmaddubsw "#freg0", %%zmm29, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm29, "#preg1" \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd "#i2"("#input"), %%zmm29 \n\t" \ + "vpaddd %%zmm0, "#preg0", %%zmm0 \n\t" \ + "vpaddd %%zmm1, "#preg1", %%zmm1 \n\t" \ + "vpaddd %%zmm2, "#preg2", %%zmm2 \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg0", %%zmm29, "#preg1" \n\t" \ + "vpmaddubsw "#freg1", %%zmm29, "#preg2" \n\t" \ + "vpbroadcastd "#i3"("#input"), %%zmm30 \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd "#i4"("#input"), %%zmm29 \n\t" \ + "vpaddd %%zmm3, "#preg0", %%zmm3 \n\t" \ + "vpaddd %%zmm4, "#preg1", %%zmm4 \n\t" \ + "vpaddd %%zmm5, "#preg2", %%zmm5 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg0", %%zmm29, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd "#i5"("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm6, "#preg0", %%zmm6 \n\t" \ + "vpaddd %%zmm7, "#preg1", %%zmm7 \n\t" \ + "vpaddd %%zmm8, "#preg2", %%zmm8 \n\t" \ + "vpmaddubsw "#freg1", %%zmm29, "#preg0" \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg2" \n\t" \ + "vpbroadcastd "#i6"("#input"), %%zmm29 \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd "#i7"("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm9, "#preg0", %%zmm9 \n\t" \ + "vpaddd %%zmm10, "#preg1", %%zmm10 \n\t" \ + "vpaddd %%zmm11, "#preg2", %%zmm11 \n\t" \ + "vpmaddubsw "#freg0", %%zmm29, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm29, "#preg1" \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd "#i8"("#input"), %%zmm29 \n\t" \ + "vpaddd %%zmm12, "#preg0", %%zmm12 \n\t" \ + "vpaddd %%zmm13, "#preg1", %%zmm13 \n\t" \ + "vpaddd %%zmm14, "#preg2", %%zmm14 \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg0", %%zmm29, "#preg1" \n\t" \ + "vpmaddubsw "#freg1", %%zmm29, "#preg2" \n\t" \ + "vpbroadcastd "#i9"("#input"), %%zmm30 \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", 
%%zmm31, "#preg2" \n\t" \ + "vpbroadcastd "#i10"("#input"), %%zmm29 \n\t" \ + "vpaddd %%zmm15, "#preg0", %%zmm15 \n\t" \ + "vpaddd %%zmm16, "#preg1", %%zmm16 \n\t" \ + "vpaddd %%zmm17, "#preg2", %%zmm17 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg0", %%zmm29, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd "#i11"("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm18, "#preg0", %%zmm18 \n\t" \ + "vpaddd %%zmm19, "#preg1", %%zmm19 \n\t" \ + "vpaddd %%zmm20, "#preg2", %%zmm20 \n\t" \ + "vpmaddubsw "#freg1", %%zmm29, "#preg0" \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vmovups "#off1"(%[filter]), "#freg1" \n\t" \ + "vpaddd %%zmm21, "#preg0", %%zmm21 \n\t" \ + "vpaddd %%zmm22, "#preg1", %%zmm22 \n\t" \ + "vpaddd %%zmm23, "#preg2", %%zmm23 \n\t" + +#define convKernel6x32c4_3(input, freg0, freg1, off0, off1, preg0, preg1, preg2, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11) \ + "vpbroadcastd "#i0"("#input"), %%zmm29 \n\t" \ + "vpbroadcastd "#i1"("#input"), %%zmm30 \n\t" \ + "vpmaddubsw "#freg0", %%zmm29, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm29, "#preg1" \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd "#i2"("#input"), %%zmm29 \n\t" \ + "vpaddd %%zmm0, "#preg0", %%zmm0 \n\t" \ + "vpaddd %%zmm1, "#preg1", %%zmm1 \n\t" \ + "vpaddd %%zmm2, "#preg2", %%zmm2 \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg0", %%zmm29, "#preg1" \n\t" \ + "vpmaddubsw "#freg1", %%zmm29, "#preg2" \n\t" \ + "vpbroadcastd "#i3"("#input"), %%zmm30 \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd "#i4"("#input"), %%zmm29 \n\t" \ + "vpaddd %%zmm3, "#preg0", %%zmm3 \n\t" \ + "vpaddd %%zmm4, "#preg1", %%zmm4 \n\t" \ + "vpaddd %%zmm5, "#preg2", %%zmm5 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg0", %%zmm29, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd "#i5"("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm6, "#preg0", %%zmm6 \n\t" \ + "vpaddd %%zmm7, "#preg1", %%zmm7 \n\t" \ + "vpaddd %%zmm8, "#preg2", %%zmm8 \n\t" \ + "vpmaddubsw "#freg1", %%zmm29, "#preg0" \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vmovups "#off1"(%[filter]), "#freg1" \n\t" \ + "vpaddd %%zmm9, "#preg0", %%zmm9 \n\t" \ + "vpaddd %%zmm10, "#preg1", %%zmm10 \n\t" \ + "vpaddd %%zmm11, "#preg2", %%zmm11 \n\t" + +#define convKernel1x32c4_3(input, freg0, freg1, off0, off1, preg0, preg1, preg2, \ + i0, i1, i2, i3, i4, i5, i6, 
i7, i8, i9, i10, i11) \ + "vpbroadcastd ("#input"), %%zmm29 \n\t" \ + "vpmaddubsw "#freg0", %%zmm29, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm29, "#preg1" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vmovups "#off1"(%[filter]), "#freg1" \n\t" \ + "vpaddd %%zmm0, "#preg0", %%zmm0 \n\t" \ + "vpaddd %%zmm1, "#preg1", %%zmm1 \n\t" + +#define convKernel12x32c4(input, freg0, freg1, off0, off1, preg0, preg1, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11) \ + convKernel12x32c4_3(input, %%zmm24, %%zmm25, off0, off1, \ + %%zmm26, %%zmm27, %%zmm28, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11) + +#define convKernel6x32c4(input, freg0, freg1, off0, off1, preg0, preg1, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11) \ + convKernel6x32c4_3(input, %%zmm24, %%zmm25, off0, off1, \ + %%zmm26, %%zmm27, %%zmm28, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11) + +#define convKernel1x32c4(input, freg0, freg1, off0, off1, preg0, preg1, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11) \ + convKernel1x32c4_3(input, %%zmm24, %%zmm25, off0, off1, \ + %%zmm26, %%zmm27, %%zmm28, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11) #endif #define convKernelForLoopXx32(rnum, wsize) \ - __asm__ __volatile__("vmovups (%[filter]), %%zmm24 \n\t" \ - "vmovups 0x40(%[filter]), %%zmm25 \n\t" \ - "addq $0x80, %[filter] \n\t" \ - "mov $1, %%eax \n\t" \ - "vmovd %%eax, %%xmm0 \n\t" \ - "vpbroadcastw %%xmm0, %%zmm31 \n\t" \ - "movq %[flags], %%rax \n\t" \ - "andq $0x1, %%rax \n\t" \ - "jne 0f \n\t" \ - load32BiasTo##rnum##Regs(%[bias]) \ - "cmpq $0x10, %%rcx \n\t" \ - "jl 4f \n\t" \ - "jmp 1f \n\t" \ - ".align 16 \n\t" \ - "0: \n\t" \ - clear##rnum##Regs(%%zmm) \ - "cmpq $0x10, %%rcx \n\t" \ - "jl 4f \n\t" \ - ".align 16 \n\t" \ - "1: \n\t" \ - "movq %[input], %%rax \n\t" \ - convKernel##wsize##x32c4(%%rax, %%zmm24, %%zmm25, 0x0, 0x40, %%zmm26, %%zmm27) \ - "addq $0x4, %%rax \n\t" \ - convKernel##wsize##x32c4(%%rax, %%zmm26, %%zmm27, 0x80, 0xC0, %%zmm24, %%zmm25) \ - "addq $0x4, %%rax \n\t" \ - convKernel##wsize##x32c4(%%rax, %%zmm24, %%zmm25, 0x100, 0x140, %%zmm26, %%zmm27) \ - "addq $0x4, %%rax \n\t" \ - convKernel##wsize##x32c4(%%rax, %%zmm26, %%zmm27, 0x180, 0x1C0, %%zmm24, %%zmm25) \ - "addq $0x200, %[filter] \n\t" \ - "addq %[fStep], %[input] \n\t" \ - "subq $0x10, %%rcx \n\t" \ - "cmpq $0x10, %%rcx \n\t" \ - "jge 1b \n\t" \ - "subq %[fStep], %[input] \n\t" \ - "addq %[f8Step], %[input] \n\t" \ - ".align 16 \n\t" \ - "4: \n\t" \ - : "+c" (c.ic), [input] "+r" (c.input), [filter] "+r" (c.filter) \ - : [bias] "r" (c.bias), [stepC16] "r" (c.stepC16), [dilateW] "r" (c.dilateW), \ - [dilateH] "r" (c.dilateH), [fStep] "r" (c.fStep), [flags] "r" (c.flags), \ - [f8Step] "r" (c.f8Step) \ - : "%rax", "%rbx", "%r9", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", \ - "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", \ - "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", \ - "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", \ - "%zmm31", "memory", "cc"); \ - if (c.ic > 0) { \ - __asm__ __volatile__("cmpq $0x8, %%rcx \n\t" \ - "jl 2f \n\t" \ - "subq $0x8, %%rcx \n\t" \ - "shr $1, %[dilateW] \n\t" \ - "shr $1, %[dilateH] \n\t" \ - "shr $1, %[fStep] \n\t" \ - "movq %[input], %%rax \n\t" \ - convKernel##wsize##x32c4(%%rax, %%zmm24, %%zmm25, 0x0, 0x40, %%zmm26, %%zmm27) \ - "addq $0x4, %%rax \n\t" 
\ - convKernel##wsize##x32c4(%%rax, %%zmm26, %%zmm27, 0x80, 0xC0, %%zmm24, %%zmm25) \ + __asm__ __volatile__("vmovups (%[filter]), %%zmm24 \n\t" \ + "vmovups 0x40(%[filter]), %%zmm25 \n\t" \ + "addq $0x80, %[filter] \n\t" \ + "mov $1, %%eax \n\t" \ + "vmovd %%eax, %%xmm0 \n\t" \ + "vpbroadcastw %%xmm0, %%zmm31 \n\t" \ + "movq %[flags], %%rax \n\t" \ + "andq $0x1, %%rax \n\t" \ + "jne 0f \n\t" \ + load32BiasTo##rnum##Regs(%[bias]) \ + "cmpq $0x10, %%rcx \n\t" \ + "jl 4f \n\t" \ + "jmp 1f \n\t" \ + ".align 16 \n\t" \ + "0: \n\t" \ + clear##rnum##Regs(%%zmm) \ + "cmpq $0x10, %%rcx \n\t" \ + "jl 4f \n\t" \ + ".align 16 \n\t" \ + "1: \n\t" \ + "movq %[input], %%rax \n\t" \ + convKernel##wsize##x32c4(%%rax, %%zmm24, %%zmm25, 0x0, 0x40, \ + %%zmm26, %%zmm27, \ + 0x0, 0x10, 0x20, 0x30, 0x40, 0x50, \ + 0x60, 0x70, 0x80, 0x90, 0xA0, 0xB0) \ + "addq $0x4, %%rax \n\t" \ + convKernel##wsize##x32c4(%%rax, %%zmm26, %%zmm27, 0x80, 0xC0, \ + %%zmm24, %%zmm25, \ + 0x0, 0x10, 0x20, 0x30, 0x40, 0x50, \ + 0x60, 0x70, 0x80, 0x90, 0xA0, 0xB0) \ + "addq $0x4, %%rax \n\t" \ + convKernel##wsize##x32c4(%%rax, %%zmm24, %%zmm25, 0x100, 0x140, \ + %%zmm26, %%zmm27, \ + 0x0, 0x10, 0x20, 0x30, 0x40, 0x50, \ + 0x60, 0x70, 0x80, 0x90, 0xA0, 0xB0) \ + "addq $0x4, %%rax \n\t" \ + convKernel##wsize##x32c4(%%rax, %%zmm26, %%zmm27, 0x180, 0x1C0, \ + %%zmm24, %%zmm25, \ + 0x0, 0x10, 0x20, 0x30, 0x40, 0x50, \ + 0x60, 0x70, 0x80, 0x90, 0xA0, 0xB0) \ + "addq $0x200, %[filter] \n\t" \ + "addq %[fStep], %[input] \n\t" \ + "subq $0x10, %%rcx \n\t" \ + "cmpq $0x10, %%rcx \n\t" \ + "jge 1b \n\t" \ + "subq %[fStep], %[input] \n\t" \ + "addq %[f8Step], %[input] \n\t" \ + ".align 16 \n\t" \ + "4: \n\t" \ + : "+c" (c.ic), \ + [input] "+r" (c.input), \ + [filter] "+r" (c.filter) \ + : [bias] "r" (c.bias), \ + [dilateW] "r" (c.dilateW), \ + [dilateH] "r" (c.dilateH), \ + [fStep] "r" (c.fStep), \ + [flags] "r" (c.flags), \ + [f8Step] "r" (c.f8Step) \ + : "%rax", "%rbx", "%r9", \ + "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", \ + "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", \ + "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", \ + "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", \ + "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", \ + "%zmm30", "%zmm31", "memory", "cc"); \ + if (c.ic > 0) { \ + __asm__ __volatile__("cmpq $0x8, %%rcx \n\t" \ + "jl 2f \n\t" \ + "subq $0x8, %%rcx \n\t" \ + "shr $1, %[dilateW] \n\t" \ + "shr $1, %[dilateH] \n\t" \ + "shr $1, %[fStep] \n\t" \ + "movq %[input], %%rax \n\t" \ + convKernel##wsize##x32c4(%%rax, %%zmm24, %%zmm25, 0x0, 0x40, \ + %%zmm26, %%zmm27, \ + 0x0, 0x8, 0x10, 0x18, 0x20, 0x28, 0x30, \ + 0x38, 0x40, 0x48, 0x50, 0x58) \ + "addq $0x4, %%rax \n\t" \ + convKernel##wsize##x32c4(%%rax, %%zmm26, %%zmm27, 0x80, 0xC0, \ + %%zmm24, %%zmm25, \ + 0x0, 0x8, 0x10, 0x18, 0x20, 0x28, 0x30, \ + 0x38, 0x40, 0x48, 0x50, 0x58) \ "addq $0x100, %[filter] \n\t" \ - "addq %[f4Step], %[input] \n\t" \ - ".align 16 \n\t" \ - "2: \n\t" \ - "cmpq $0x4, %%rcx \n\t" \ - "jl 5f \n\t" \ - "shr $1, %[dilateW] \n\t" \ - "shr $1, %[dilateH] \n\t" \ - convKernel##wsize##x32c4(%[input], %%zmm24, %%zmm25, 0x0, 0x40, %%zmm26, %%zmm27) \ - "addq $0x80, %[filter] \n\t" \ - ".align 16 \n\t" \ - "5: \n\t" \ - : "+c" (c.ic) \ - : [input] "r" (c.input), [filter] "r" (c.filter), [bias] "r" (c.bias), \ - [dilateW] "r" (c.dilateW), \ - [dilateH] "r" (c.dilateH), [fStep] "r" (c.fStep), \ - [f4Step] "r" (c.f4Step) \ - : "%rax", "%rbx", "%r9", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", 
"%zmm6", \ - "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", \ - "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", \ - "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", \ - "%zmm31", "memory", "cc"); \ + "addq %[f4Step], %[input] \n\t" \ + ".align 16 \n\t" \ + "2: \n\t" \ + "cmpq $0x4, %%rcx \n\t" \ + "jl 5f \n\t" \ + "shr $1, %[dilateW] \n\t" \ + "shr $1, %[dilateH] \n\t" \ + convKernel##wsize##x32c4(%[input], %%zmm24, %%zmm25, 0x0, 0x40, \ + %%zmm26, %%zmm27, \ + 0x0, 0x4, 0x8, 0xC, 0x10, 0x14, \ + 0x18, 0x1C, 0x20, 0x24, 0x28, 0x2C) \ + "addq $0x80, %[filter] \n\t" \ + ".align 16 \n\t" \ + "5: \n\t" \ + : "+c" (c.ic) \ + : [input] "r" (c.input), \ + [filter] "r" (c.filter), \ + [bias] "r" (c.bias), \ + [dilateW] "r" (c.dilateW), \ + [dilateH] "r" (c.dilateH), \ + [fStep] "r" (c.fStep), \ + [f4Step] "r" (c.f4Step) \ + : "%rax", "%rbx", "%r9", \ + "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", \ + "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", \ + "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", \ + "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", \ + "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", \ + "%zmm30", "%zmm31", "memory", "cc"); \ } void Avx512Conv1x1Kernel12x32(ConvController &c) { convKernelForLoopXx32(24, 12) - __asm__ __volatile__("movq %[output], %%rax \n\t" - "movq %[ostepC16], %%rbx \n\t" - "movq %[flags], %%rcx \n\t" - "and $0x1, %%rcx \n\t" + __asm__ __volatile__("movq %[output], %%rax \n\t" + "movq %[ostepC16], %%rbx \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x1, %%rcx \n\t" "je 0f \n\t" - "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" - "vpaddd 0x40(%%rax), %%zmm2, %%zmm2 \n\t" - "vpaddd 0x80(%%rax), %%zmm4, %%zmm4 \n\t" - "vpaddd 0xC0(%%rax), %%zmm6, %%zmm6 \n\t" - "vpaddd 0x100(%%rax), %%zmm8, %%zmm8 \n\t" - "vpaddd 0x140(%%rax), %%zmm10, %%zmm10 \n\t" - "vpaddd 0x180(%%rax), %%zmm12, %%zmm12 \n\t" - "vpaddd 0x1C0(%%rax), %%zmm14, %%zmm14 \n\t" - "vpaddd 0x200(%%rax), %%zmm16, %%zmm16 \n\t" - "vpaddd 0x240(%%rax), %%zmm18, %%zmm18 \n\t" - "vpaddd 0x280(%%rax), %%zmm20, %%zmm20 \n\t" - "vpaddd 0x2C0(%%rax), %%zmm22, %%zmm22 \n\t" - "vpaddd (%%rax, %%rbx), %%zmm1, %%zmm1 \n\t" - "vpaddd 0x40(%%rax, %%rbx), %%zmm3, %%zmm3 \n\t" - "vpaddd 0x80(%%rax, %%rbx), %%zmm5, %%zmm5 \n\t" - "vpaddd 0xC0(%%rax, %%rbx), %%zmm7, %%zmm7 \n\t" - "vpaddd 0x100(%%rax, %%rbx), %%zmm9, %%zmm9 \n\t" - "vpaddd 0x140(%%rax, %%rbx), %%zmm11, %%zmm11 \n\t" - "vpaddd 0x180(%%rax, %%rbx), %%zmm13, %%zmm13 \n\t" - "vpaddd 0x1C0(%%rax, %%rbx), %%zmm15, %%zmm15 \n\t" - "vpaddd 0x200(%%rax, %%rbx), %%zmm17, %%zmm17 \n\t" - "vpaddd 0x240(%%rax, %%rbx), %%zmm19, %%zmm19 \n\t" - "vpaddd 0x280(%%rax, %%rbx), %%zmm21, %%zmm21 \n\t" - "vpaddd 0x2C0(%%rax, %%rbx), %%zmm23, %%zmm23 \n\t" + "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" + "vpaddd 0x40(%%rax), %%zmm2, %%zmm2 \n\t" + "vpaddd 0x80(%%rax), %%zmm4, %%zmm4 \n\t" + "vpaddd 0xC0(%%rax), %%zmm6, %%zmm6 \n\t" + "vpaddd 0x100(%%rax), %%zmm8, %%zmm8 \n\t" + "vpaddd 0x140(%%rax), %%zmm10, %%zmm10 \n\t" + "vpaddd 0x180(%%rax), %%zmm12, %%zmm12 \n\t" + "vpaddd 0x1C0(%%rax), %%zmm14, %%zmm14 \n\t" + "vpaddd 0x200(%%rax), %%zmm16, %%zmm16 \n\t" + "vpaddd 0x240(%%rax), %%zmm18, %%zmm18 \n\t" + "vpaddd 0x280(%%rax), %%zmm20, %%zmm20 \n\t" + "vpaddd 0x2C0(%%rax), %%zmm22, %%zmm22 \n\t" + "vpaddd (%%rax, %%rbx), %%zmm1, %%zmm1 \n\t" + "vpaddd 0x40(%%rax, %%rbx), %%zmm3, %%zmm3 \n\t" + "vpaddd 0x80(%%rax, %%rbx), %%zmm5, %%zmm5 \n\t" + 
"vpaddd 0xC0(%%rax, %%rbx), %%zmm7, %%zmm7 \n\t" + "vpaddd 0x100(%%rax, %%rbx), %%zmm9, %%zmm9 \n\t" + "vpaddd 0x140(%%rax, %%rbx), %%zmm11, %%zmm11 \n\t" + "vpaddd 0x180(%%rax, %%rbx), %%zmm13, %%zmm13 \n\t" + "vpaddd 0x1C0(%%rax, %%rbx), %%zmm15, %%zmm15 \n\t" + "vpaddd 0x200(%%rax, %%rbx), %%zmm17, %%zmm17 \n\t" + "vpaddd 0x240(%%rax, %%rbx), %%zmm19, %%zmm19 \n\t" + "vpaddd 0x280(%%rax, %%rbx), %%zmm21, %%zmm21 \n\t" + "vpaddd 0x2C0(%%rax, %%rbx), %%zmm23, %%zmm23 \n\t" ".align 16 \n\t" "0: \n\t" - "movq %[flags], %%rcx \n\t" - "and $0xC, %%rcx \n\t" - "je 1f \n\t" + "cmpq $0x0, %[scale] \n\t" + "jne 1f \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" relu24Regs(%%zmm) + "jmp 4f \n\t" ".align 16 \n\t" "1: \n\t" - "cmpq $0x0, %[scale] \n\t" - "je 2f \n\t" convert24RegsI32ToF32(%[scale], %%zmm) ".align 16 \n\t" "2: \n\t" - "vmovups %%zmm0, (%%rax) \n\t" - "vmovups %%zmm2, 0x40(%%rax) \n\t" - "vmovups %%zmm4, 0x80(%%rax) \n\t" - "vmovups %%zmm6, 0xC0(%%rax) \n\t" - "vmovups %%zmm8, 0x100(%%rax) \n\t" - "vmovups %%zmm10, 0x140(%%rax) \n\t" - "vmovups %%zmm12, 0x180(%%rax) \n\t" - "vmovups %%zmm14, 0x1C0(%%rax) \n\t" - "vmovups %%zmm16, 0x200(%%rax) \n\t" - "vmovups %%zmm18, 0x240(%%rax) \n\t" - "vmovups %%zmm20, 0x280(%%rax) \n\t" - "vmovups %%zmm22, 0x2C0(%%rax) \n\t" - "vmovups %%zmm1, (%%rax, %%rbx) \n\t" - "vmovups %%zmm3, 0x40(%%rax, %%rbx) \n\t" - "vmovups %%zmm5, 0x80(%%rax, %%rbx) \n\t" - "vmovups %%zmm7, 0xC0(%%rax, %%rbx) \n\t" - "vmovups %%zmm9, 0x100(%%rax, %%rbx) \n\t" - "vmovups %%zmm11, 0x140(%%rax, %%rbx) \n\t" - "vmovups %%zmm13, 0x180(%%rax, %%rbx) \n\t" - "vmovups %%zmm15, 0x1C0(%%rax, %%rbx) \n\t" - "vmovups %%zmm17, 0x200(%%rax, %%rbx) \n\t" - "vmovups %%zmm19, 0x240(%%rax, %%rbx) \n\t" - "vmovups %%zmm21, 0x280(%%rax, %%rbx) \n\t" - "vmovups %%zmm23, 0x2C0(%%rax, %%rbx) \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x2, %%rcx \n\t" + "je 3f \n\t" + "vaddps (%[eltwise]), %%zmm0, %%zmm0 \n\t" + "vaddps 0x40(%[eltwise]), %%zmm2, %%zmm2 \n\t" + "vaddps 0x80(%[eltwise]), %%zmm4, %%zmm4 \n\t" + "vaddps 0xC0(%[eltwise]), %%zmm6, %%zmm6 \n\t" + "vaddps 0x100(%[eltwise]), %%zmm8, %%zmm8 \n\t" + "vaddps 0x140(%[eltwise]), %%zmm10, %%zmm10 \n\t" + "vaddps 0x180(%[eltwise]), %%zmm12, %%zmm12 \n\t" + "vaddps 0x1C0(%[eltwise]), %%zmm14, %%zmm14 \n\t" + "vaddps 0x200(%[eltwise]), %%zmm16, %%zmm16 \n\t" + "vaddps 0x240(%[eltwise]), %%zmm18, %%zmm18 \n\t" + "vaddps 0x280(%[eltwise]), %%zmm20, %%zmm20 \n\t" + "vaddps 0x2C0(%[eltwise]), %%zmm22, %%zmm22 \n\t" + "vaddps (%[eltwise], %%rbx), %%zmm1, %%zmm1 \n\t" + "vaddps 0x40(%[eltwise], %%rbx), %%zmm3, %%zmm3 \n\t" + "vaddps 0x80(%[eltwise], %%rbx), %%zmm5, %%zmm5 \n\t" + "vaddps 0xC0(%[eltwise], %%rbx), %%zmm7, %%zmm7 \n\t" + "vaddps 0x100(%[eltwise], %%rbx), %%zmm9, %%zmm9 \n\t" + "vaddps 0x140(%[eltwise], %%rbx), %%zmm11, %%zmm11 \n\t" + "vaddps 0x180(%[eltwise], %%rbx), %%zmm13, %%zmm13 \n\t" + "vaddps 0x1C0(%[eltwise], %%rbx), %%zmm15, %%zmm15 \n\t" + "vaddps 0x200(%[eltwise], %%rbx), %%zmm17, %%zmm17 \n\t" + "vaddps 0x240(%[eltwise], %%rbx), %%zmm19, %%zmm19 \n\t" + "vaddps 0x280(%[eltwise], %%rbx), %%zmm21, %%zmm21 \n\t" + "vaddps 0x2C0(%[eltwise], %%rbx), %%zmm23, %%zmm23 \n\t" + + ".align 16 \n\t" + "3: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" + relu24RegsPs(%%zmm) + + ".align 16 \n\t" + "4: \n\t" + "vmovups %%zmm0, (%%rax) \n\t" + "vmovups %%zmm2, 0x40(%%rax) \n\t" + "vmovups %%zmm4, 0x80(%%rax) \n\t" + "vmovups %%zmm6, 0xC0(%%rax) \n\t" + "vmovups %%zmm8, 
0x100(%%rax) \n\t" + "vmovups %%zmm10, 0x140(%%rax) \n\t" + "vmovups %%zmm12, 0x180(%%rax) \n\t" + "vmovups %%zmm14, 0x1C0(%%rax) \n\t" + "vmovups %%zmm16, 0x200(%%rax) \n\t" + "vmovups %%zmm18, 0x240(%%rax) \n\t" + "vmovups %%zmm20, 0x280(%%rax) \n\t" + "vmovups %%zmm22, 0x2C0(%%rax) \n\t" + "vmovups %%zmm1, (%%rax, %%rbx) \n\t" + "vmovups %%zmm3, 0x40(%%rax, %%rbx) \n\t" + "vmovups %%zmm5, 0x80(%%rax, %%rbx) \n\t" + "vmovups %%zmm7, 0xC0(%%rax, %%rbx) \n\t" + "vmovups %%zmm9, 0x100(%%rax, %%rbx) \n\t" + "vmovups %%zmm11, 0x140(%%rax, %%rbx) \n\t" + "vmovups %%zmm13, 0x180(%%rax, %%rbx) \n\t" + "vmovups %%zmm15, 0x1C0(%%rax, %%rbx) \n\t" + "vmovups %%zmm17, 0x200(%%rax, %%rbx) \n\t" + "vmovups %%zmm19, 0x240(%%rax, %%rbx) \n\t" + "vmovups %%zmm21, 0x280(%%rax, %%rbx) \n\t" + "vmovups %%zmm23, 0x2C0(%%rax, %%rbx) \n\t" : - : [output] "r" (c.output), [ostepC16] "r" (c.ostepC16), [flags] "r" (c.flags), [scale] "r" (c.scale) - : "%rax", "%rbx", "%rcx", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", - "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", - "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", - "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", - "%zmm31", "memory", "cc"); + : [output] "r" (c.output), + [ostepC16] "r" (c.ostepC16), + [eltwise] "r" (c.eltwise), + [flags] "r" (c.flags), + [scale] "r" (c.scale) + : "%rax", "%rbx", "%rcx", + "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", + "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", + "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", + "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", + "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", + "%zmm30", "%zmm31", "memory", "cc"); } void Avx512Conv1x1Kernel6x32(ConvController &c) { convKernelForLoopXx32(12, 6) - __asm__ __volatile__("movq %[output], %%rax \n\t" - "movq %[ostepC16], %%rbx \n\t" - "movq %[flags], %%rcx \n\t" - "and $0x1, %%rcx \n\t" + __asm__ __volatile__("movq %[output], %%rax \n\t" + "movq %[ostepC16], %%rbx \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x1, %%rcx \n\t" "je 0f \n\t" - "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" - "vpaddd 0x40(%%rax), %%zmm2, %%zmm2 \n\t" - "vpaddd 0x80(%%rax), %%zmm4, %%zmm4 \n\t" - "vpaddd 0xC0(%%rax), %%zmm6, %%zmm6 \n\t" - "vpaddd 0x100(%%rax), %%zmm8, %%zmm8 \n\t" - "vpaddd 0x140(%%rax), %%zmm10, %%zmm10 \n\t" - "vpaddd (%%rax, %%rbx), %%zmm1, %%zmm1 \n\t" - "vpaddd 0x40(%%rax, %%rbx), %%zmm3, %%zmm3 \n\t" - "vpaddd 0x80(%%rax, %%rbx), %%zmm5, %%zmm5 \n\t" - "vpaddd 0xC0(%%rax, %%rbx), %%zmm7, %%zmm7 \n\t" - "vpaddd 0x100(%%rax, %%rbx), %%zmm9, %%zmm9 \n\t" - "vpaddd 0x140(%%rax, %%rbx), %%zmm11, %%zmm11 \n\t" + "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" + "vpaddd 0x40(%%rax), %%zmm2, %%zmm2 \n\t" + "vpaddd 0x80(%%rax), %%zmm4, %%zmm4 \n\t" + "vpaddd 0xC0(%%rax), %%zmm6, %%zmm6 \n\t" + "vpaddd 0x100(%%rax), %%zmm8, %%zmm8 \n\t" + "vpaddd 0x140(%%rax), %%zmm10, %%zmm10 \n\t" + "vpaddd (%%rax, %%rbx), %%zmm1, %%zmm1 \n\t" + "vpaddd 0x40(%%rax, %%rbx), %%zmm3, %%zmm3 \n\t" + "vpaddd 0x80(%%rax, %%rbx), %%zmm5, %%zmm5 \n\t" + "vpaddd 0xC0(%%rax, %%rbx), %%zmm7, %%zmm7 \n\t" + "vpaddd 0x100(%%rax, %%rbx), %%zmm9, %%zmm9 \n\t" + "vpaddd 0x140(%%rax, %%rbx), %%zmm11, %%zmm11 \n\t" ".align 16 \n\t" "0: \n\t" - "movq %[flags], %%rcx \n\t" - "and $0xC, %%rcx \n\t" - "je 1f \n\t" + "cmpq $0x0, %[scale] \n\t" + "jne 1f \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" relu12Regs(%%zmm) + "jmp 
4f \n\t" ".align 16 \n\t" "1: \n\t" - "cmpq $0x0, %[scale] \n\t" - "je 2f \n\t" convert12RegsI32ToF32(%[scale], %%zmm) ".align 16 \n\t" "2: \n\t" - "vmovups %%zmm0, (%%rax) \n\t" - "vmovups %%zmm2, 0x40(%%rax) \n\t" - "vmovups %%zmm4, 0x80(%%rax) \n\t" - "vmovups %%zmm6, 0xC0(%%rax) \n\t" - "vmovups %%zmm8, 0x100(%%rax) \n\t" - "vmovups %%zmm10, 0x140(%%rax) \n\t" - "vmovups %%zmm1, (%%rax, %%rbx) \n\t" - "vmovups %%zmm3, 0x40(%%rax, %%rbx) \n\t" - "vmovups %%zmm5, 0x80(%%rax, %%rbx) \n\t" - "vmovups %%zmm7, 0xC0(%%rax, %%rbx) \n\t" - "vmovups %%zmm9, 0x100(%%rax, %%rbx) \n\t" - "vmovups %%zmm11, 0x140(%%rax, %%rbx) \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x2, %%rcx \n\t" + "je 3f \n\t" + "vaddps (%[eltwise]), %%zmm0, %%zmm0 \n\t" + "vaddps 0x40(%[eltwise]), %%zmm2, %%zmm2 \n\t" + "vaddps 0x80(%[eltwise]), %%zmm4, %%zmm4 \n\t" + "vaddps 0xC0(%[eltwise]), %%zmm6, %%zmm6 \n\t" + "vaddps 0x100(%[eltwise]), %%zmm8, %%zmm8 \n\t" + "vaddps 0x140(%[eltwise]), %%zmm10, %%zmm10 \n\t" + "vaddps (%[eltwise], %%rbx), %%zmm1, %%zmm1 \n\t" + "vaddps 0x40(%[eltwise], %%rbx), %%zmm3, %%zmm3 \n\t" + "vaddps 0x80(%[eltwise], %%rbx), %%zmm5, %%zmm5 \n\t" + "vaddps 0xC0(%[eltwise], %%rbx), %%zmm7, %%zmm7 \n\t" + "vaddps 0x100(%[eltwise], %%rbx), %%zmm9, %%zmm9 \n\t" + "vaddps 0x140(%[eltwise], %%rbx), %%zmm11, %%zmm11 \n\t" + + ".align 16 \n\t" + "3: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" + relu12RegsPs(%%zmm) + + ".align 16 \n\t" + "4: \n\t" + "vmovups %%zmm0, (%%rax) \n\t" + "vmovups %%zmm2, 0x40(%%rax) \n\t" + "vmovups %%zmm4, 0x80(%%rax) \n\t" + "vmovups %%zmm6, 0xC0(%%rax) \n\t" + "vmovups %%zmm8, 0x100(%%rax) \n\t" + "vmovups %%zmm10, 0x140(%%rax) \n\t" + "vmovups %%zmm1, (%%rax, %%rbx) \n\t" + "vmovups %%zmm3, 0x40(%%rax, %%rbx) \n\t" + "vmovups %%zmm5, 0x80(%%rax, %%rbx) \n\t" + "vmovups %%zmm7, 0xC0(%%rax, %%rbx) \n\t" + "vmovups %%zmm9, 0x100(%%rax, %%rbx) \n\t" + "vmovups %%zmm11, 0x140(%%rax, %%rbx) \n\t" : - : [output] "r" (c.output), [ostepC16] "r" (c.ostepC16), [flags] "r" (c.flags), [scale] "r" (c.scale) - : "%rax", "%rbx", "%rcx", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", - "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", - "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", - "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", - "%zmm31", "memory", "cc"); + : [output] "r" (c.output), + [ostepC16] "r" (c.ostepC16), + [eltwise] "r" (c.eltwise), + [flags] "r" (c.flags), + [scale] "r" (c.scale) + : "%rax", "%rbx", "%rcx", + "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", + "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", + "%zmm24", "%zmm31", "memory", "cc"); } void Avx512Conv1x1Kernel1x32(ConvController &c) { convKernelForLoopXx32(2, 1) - __asm__ __volatile__("movq %[output], %%rax \n\t" - "movq %[ostepC16], %%rbx \n\t" - "movq %[flags], %%rcx \n\t" - "and $0x1, %%rcx \n\t" - "je 0f \n\t" - "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" - "vpaddd (%%rax, %%rbx), %%zmm1, %%zmm1 \n\t" + __asm__ __volatile__("movq %[output], %%rax \n\t" + "movq %[ostepC16], %%rbx \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x1, %%rcx \n\t" + "je 0f \n\t" + "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" + "vpaddd (%%rax, %%rbx), %%zmm1, %%zmm1 \n\t" - ".align 16 \n\t" - "0: \n\t" - "movq %[flags], %%rcx \n\t" - "and $0xC, %%rcx \n\t" - "je 1f \n\t" + ".align 16 \n\t" + "0: \n\t" + "cmpq $0x0, %[scale] \n\t" + "jne 1f \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" 
+ "je 4f \n\t" relu2Regs(%%zmm) + "jmp 4f \n\t" - ".align 16 \n\t" - "1: \n\t" - "cmpq $0x0, %[scale] \n\t" - "je 2f \n\t" + ".align 16 \n\t" + "1: \n\t" convert2RegsI32ToF32(%[scale], %%zmm) - ".align 16 \n\t" - "2: \n\t" - "vmovups %%zmm0, (%%rax) \n\t" - "vmovups %%zmm1, (%%rax, %%rbx) \n\t" + ".align 16 \n\t" + "2: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x2, %%rcx \n\t" + "je 3f \n\t" + "vaddps (%[eltwise]), %%zmm0, %%zmm0 \n\t" + "vaddps (%[eltwise], %%rbx), %%zmm1, %%zmm1 \n\t" + + ".align 16 \n\t" + "3: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" + relu2RegsPs(%%zmm) + + ".align 16 \n\t" + "4: \n\t" + "vmovups %%zmm0, (%%rax) \n\t" + "vmovups %%zmm1, (%%rax, %%rbx) \n\t" : - : [output] "r" (c.output), [ostepC16] "r" (c.ostepC16), [flags] "r" (c.flags), [scale] "r" (c.scale) - : "%rax", "%rbx", "%rcx", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", - "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", - "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", - "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", - "%zmm31", "memory", "cc"); + : [output] "r" (c.output), + [ostepC16] "r" (c.ostepC16), + [eltwise] "r" (c.eltwise), + [flags] "r" (c.flags), + [scale] "r" (c.scale) + : "%rax", "%rbx", "%rcx", + "%zmm0", "%zmm1", "%zmm24", "%zmm31", "memory", "cc"); } -#define load16BiasTo1Regs(bias, rtype) \ - "vmovups ("#bias"), "#rtype"0 \n\t" - -#define load16BiasTo12Regs(bias, rtype) \ - load16BiasTo1Regs(bias, rtype) \ - "vmovups "#rtype"0, "#rtype"1 \n\t" \ - "vmovups "#rtype"0, "#rtype"2 \n\t" \ - "vmovups "#rtype"0, "#rtype"3 \n\t" \ - "vmovups "#rtype"0, "#rtype"4 \n\t" \ - "vmovups "#rtype"0, "#rtype"5 \n\t" \ - "vmovups "#rtype"0, "#rtype"6 \n\t" \ - "vmovups "#rtype"0, "#rtype"7 \n\t" \ - "vmovups "#rtype"0, "#rtype"8 \n\t" \ - "vmovups "#rtype"0, "#rtype"9 \n\t" \ - "vmovups "#rtype"0, "#rtype"10 \n\t" \ - "vmovups "#rtype"0, "#rtype"11 \n\t" - -#define load16BiasTo24Regs(bias, rtype) \ - load16BiasTo12Regs(bias, rtype) \ - "vmovups "#rtype"0, "#rtype"12 \n\t" \ - "vmovups "#rtype"0, "#rtype"13 \n\t" \ - "vmovups "#rtype"0, "#rtype"14 \n\t" \ - "vmovups "#rtype"0, "#rtype"15 \n\t" \ - "vmovups "#rtype"0, "#rtype"16 \n\t" \ - "vmovups "#rtype"0, "#rtype"17 \n\t" \ - "vmovups "#rtype"0, "#rtype"18 \n\t" \ - "vmovups "#rtype"0, "#rtype"19 \n\t" \ - "vmovups "#rtype"0, "#rtype"20 \n\t" \ - "vmovups "#rtype"0, "#rtype"21 \n\t" \ - "vmovups "#rtype"0, "#rtype"22 \n\t" \ - "vmovups "#rtype"0, "#rtype"23 \n\t" - #ifdef _USE_AVX512_VNNI -#define convKernel24x16c4(input, freg0, off0, preg0, rtype) \ - "vpbroadcastd ("#input"), "#rtype"26 \n\t" \ - "vpbroadcastd 0x10("#input"), "#rtype"27 \n\t" \ - "vpbroadcastd 0x20("#input"), "#rtype"28 \n\t" \ - "vpbroadcastd 0x30("#input"), "#rtype"29 \n\t" \ - "vpbroadcastd 0x40("#input"), "#rtype"30 \n\t" \ - "vpbroadcastd 0x50("#input"), "#rtype"31 \n\t" \ - "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ - "vpdpbusd "#freg0", "#rtype"26, "#rtype"0 \n\t" \ - "vpdpbusd "#freg0", "#rtype"27, "#rtype"1 \n\t" \ - "vpdpbusd "#freg0", "#rtype"28, "#rtype"2 \n\t" \ - "vpdpbusd "#freg0", "#rtype"29, "#rtype"3 \n\t" \ - "vpdpbusd "#freg0", "#rtype"30, "#rtype"4 \n\t" \ - "vpdpbusd "#freg0", "#rtype"31, "#rtype"5 \n\t" \ - "vpbroadcastd 0x60("#input"), "#rtype"26 \n\t" \ - "vpbroadcastd 0x70("#input"), "#rtype"27 \n\t" \ - "vpbroadcastd 0x80("#input"), "#rtype"28 \n\t" \ - "vpbroadcastd 0x90("#input"), "#rtype"29 \n\t" \ - 
"vpbroadcastd 0xA0("#input"), "#rtype"30 \n\t" \ - "vpbroadcastd 0xB0("#input"), "#rtype"31 \n\t" \ - "vpdpbusd "#freg0", "#rtype"26, "#rtype"6 \n\t" \ - "vpdpbusd "#freg0", "#rtype"27, "#rtype"7 \n\t" \ - "vpdpbusd "#freg0", "#rtype"28, "#rtype"8 \n\t" \ - "vpdpbusd "#freg0", "#rtype"29, "#rtype"9 \n\t" \ - "vpdpbusd "#freg0", "#rtype"30, "#rtype"10 \n\t" \ - "vpdpbusd "#freg0", "#rtype"31, "#rtype"11 \n\t" \ - "vpbroadcastd 0xC0("#input"), "#rtype"26 \n\t" \ - "vpbroadcastd 0xD0("#input"), "#rtype"27 \n\t" \ - "vpbroadcastd 0xE0("#input"), "#rtype"28 \n\t" \ - "vpbroadcastd 0xF0("#input"), "#rtype"29 \n\t" \ - "vpbroadcastd 0x100("#input"), "#rtype"30 \n\t" \ - "vpbroadcastd 0x110("#input"), "#rtype"31 \n\t" \ - "vpdpbusd "#freg0", "#rtype"26, "#rtype"12 \n\t" \ - "vpdpbusd "#freg0", "#rtype"27, "#rtype"13 \n\t" \ - "vpdpbusd "#freg0", "#rtype"28, "#rtype"14 \n\t" \ - "vpdpbusd "#freg0", "#rtype"29, "#rtype"15 \n\t" \ - "vpdpbusd "#freg0", "#rtype"30, "#rtype"16 \n\t" \ - "vpdpbusd "#freg0", "#rtype"31, "#rtype"17 \n\t" \ - "vpbroadcastd 0x120("#input"), "#rtype"26 \n\t" \ - "vpbroadcastd 0x130("#input"), "#rtype"27 \n\t" \ - "vpbroadcastd 0x140("#input"), "#rtype"28 \n\t" \ - "vpbroadcastd 0x150("#input"), "#rtype"29 \n\t" \ - "vpbroadcastd 0x160("#input"), "#rtype"30 \n\t" \ - "vpbroadcastd 0x170("#input"), "#rtype"31 \n\t" \ - "vpdpbusd "#freg0", "#rtype"26, "#rtype"18 \n\t" \ - "vpdpbusd "#freg0", "#rtype"27, "#rtype"19 \n\t" \ - "vpdpbusd "#freg0", "#rtype"28, "#rtype"20 \n\t" \ - "vpdpbusd "#freg0", "#rtype"29, "#rtype"21 \n\t" \ - "vpdpbusd "#freg0", "#rtype"30, "#rtype"22 \n\t" \ - "vpdpbusd "#freg0", "#rtype"31, "#rtype"23 \n\t" - -#define convKernel12x16c4(input, freg0, off0, preg0, rtype) \ - "vpbroadcastd ("#input"), "#rtype"26 \n\t" \ - "vpbroadcastd 0x10("#input"), "#rtype"27 \n\t" \ - "vpbroadcastd 0x20("#input"), "#rtype"28 \n\t" \ - "vpbroadcastd 0x30("#input"), "#rtype"29 \n\t" \ - "vpbroadcastd 0x40("#input"), "#rtype"30 \n\t" \ - "vpbroadcastd 0x50("#input"), "#rtype"31 \n\t" \ - "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ - "vpdpbusd "#freg0", "#rtype"26, "#rtype"0 \n\t" \ - "vpdpbusd "#freg0", "#rtype"27, "#rtype"1 \n\t" \ - "vpdpbusd "#freg0", "#rtype"28, "#rtype"2 \n\t" \ - "vpdpbusd "#freg0", "#rtype"29, "#rtype"3 \n\t" \ - "vpdpbusd "#freg0", "#rtype"30, "#rtype"4 \n\t" \ - "vpdpbusd "#freg0", "#rtype"31, "#rtype"5 \n\t" \ - "vpbroadcastd 0x60("#input"), "#rtype"26 \n\t" \ - "vpbroadcastd 0x70("#input"), "#rtype"27 \n\t" \ - "vpbroadcastd 0x80("#input"), "#rtype"28 \n\t" \ - "vpbroadcastd 0x90("#input"), "#rtype"29 \n\t" \ - "vpbroadcastd 0xA0("#input"), "#rtype"30 \n\t" \ - "vpbroadcastd 0xB0("#input"), "#rtype"31 \n\t" \ - "vpdpbusd "#freg0", "#rtype"26, "#rtype"6 \n\t" \ - "vpdpbusd "#freg0", "#rtype"27, "#rtype"7 \n\t" \ - "vpdpbusd "#freg0", "#rtype"28, "#rtype"8 \n\t" \ - "vpdpbusd "#freg0", "#rtype"29, "#rtype"9 \n\t" \ - "vpdpbusd "#freg0", "#rtype"30, "#rtype"10 \n\t" \ - "vpdpbusd "#freg0", "#rtype"31, "#rtype"11 \n\t" - -#define convKernel1x16c4(input, freg0, off0, preg0, rtype) \ - "vpbroadcastd ("#input"), "#rtype"26 \n\t" \ - "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ - "vpdpbusd "#freg0", "#rtype"26, "#rtype"0 \n\t" +#define convKernel24x16c4(input, freg0, off0, preg0, rtype, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, \ + i12, i13, i14, i15, i16, i17, i18, i19, i20, i21, i22, i23) \ + "vpbroadcastd "#i0"("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd "#i1"("#input"), "#rtype"27 \n\t" \ + "vpbroadcastd "#i2"("#input"), "#rtype"28 
\n\t" \ + "vpbroadcastd "#i3"("#input"), "#rtype"29 \n\t" \ + "vpbroadcastd "#i4"("#input"), "#rtype"30 \n\t" \ + "vpbroadcastd "#i5"("#input"), "#rtype"31 \n\t" \ + "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ + "vpdpbusd "#freg0", "#rtype"26, "#rtype"0 \n\t" \ + "vpdpbusd "#freg0", "#rtype"27, "#rtype"1 \n\t" \ + "vpdpbusd "#freg0", "#rtype"28, "#rtype"2 \n\t" \ + "vpdpbusd "#freg0", "#rtype"29, "#rtype"3 \n\t" \ + "vpdpbusd "#freg0", "#rtype"30, "#rtype"4 \n\t" \ + "vpdpbusd "#freg0", "#rtype"31, "#rtype"5 \n\t" \ + "vpbroadcastd "#i6"("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd "#i7"("#input"), "#rtype"27 \n\t" \ + "vpbroadcastd "#i8"("#input"), "#rtype"28 \n\t" \ + "vpbroadcastd "#i9"("#input"), "#rtype"29 \n\t" \ + "vpbroadcastd "#i10"("#input"), "#rtype"30 \n\t" \ + "vpbroadcastd "#i11"("#input"), "#rtype"31 \n\t" \ + "vpdpbusd "#freg0", "#rtype"26, "#rtype"6 \n\t" \ + "vpdpbusd "#freg0", "#rtype"27, "#rtype"7 \n\t" \ + "vpdpbusd "#freg0", "#rtype"28, "#rtype"8 \n\t" \ + "vpdpbusd "#freg0", "#rtype"29, "#rtype"9 \n\t" \ + "vpdpbusd "#freg0", "#rtype"30, "#rtype"10 \n\t" \ + "vpdpbusd "#freg0", "#rtype"31, "#rtype"11 \n\t" \ + "vpbroadcastd "#i12"("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd "#i13"("#input"), "#rtype"27 \n\t" \ + "vpbroadcastd "#i14"("#input"), "#rtype"28 \n\t" \ + "vpbroadcastd "#i15"("#input"), "#rtype"29 \n\t" \ + "vpbroadcastd "#i16"("#input"), "#rtype"30 \n\t" \ + "vpbroadcastd "#i17"("#input"), "#rtype"31 \n\t" \ + "vpdpbusd "#freg0", "#rtype"26, "#rtype"12 \n\t" \ + "vpdpbusd "#freg0", "#rtype"27, "#rtype"13 \n\t" \ + "vpdpbusd "#freg0", "#rtype"28, "#rtype"14 \n\t" \ + "vpdpbusd "#freg0", "#rtype"29, "#rtype"15 \n\t" \ + "vpdpbusd "#freg0", "#rtype"30, "#rtype"16 \n\t" \ + "vpdpbusd "#freg0", "#rtype"31, "#rtype"17 \n\t" \ + "vpbroadcastd "#i18"("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd "#i19"("#input"), "#rtype"27 \n\t" \ + "vpbroadcastd "#i20"("#input"), "#rtype"28 \n\t" \ + "vpbroadcastd "#i21"("#input"), "#rtype"29 \n\t" \ + "vpbroadcastd "#i22"("#input"), "#rtype"30 \n\t" \ + "vpbroadcastd "#i23"("#input"), "#rtype"31 \n\t" \ + "vpdpbusd "#freg0", "#rtype"26, "#rtype"18 \n\t" \ + "vpdpbusd "#freg0", "#rtype"27, "#rtype"19 \n\t" \ + "vpdpbusd "#freg0", "#rtype"28, "#rtype"20 \n\t" \ + "vpdpbusd "#freg0", "#rtype"29, "#rtype"21 \n\t" \ + "vpdpbusd "#freg0", "#rtype"30, "#rtype"22 \n\t" \ + "vpdpbusd "#freg0", "#rtype"31, "#rtype"23 \n\t" + +#define convKernel12x16c4(input, freg0, off0, preg0, rtype, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, \ + i12, i13, i14, i15, i16, i17, i18, i19, i20, i21, i22, i23) \ + "vpbroadcastd "#i0"("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd "#i1"("#input"), "#rtype"27 \n\t" \ + "vpbroadcastd "#i2"("#input"), "#rtype"28 \n\t" \ + "vpbroadcastd "#i3"("#input"), "#rtype"29 \n\t" \ + "vpbroadcastd "#i4"("#input"), "#rtype"30 \n\t" \ + "vpbroadcastd "#i5"("#input"), "#rtype"31 \n\t" \ + "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ + "vpdpbusd "#freg0", "#rtype"26, "#rtype"0 \n\t" \ + "vpdpbusd "#freg0", "#rtype"27, "#rtype"1 \n\t" \ + "vpdpbusd "#freg0", "#rtype"28, "#rtype"2 \n\t" \ + "vpdpbusd "#freg0", "#rtype"29, "#rtype"3 \n\t" \ + "vpdpbusd "#freg0", "#rtype"30, "#rtype"4 \n\t" \ + "vpdpbusd "#freg0", "#rtype"31, "#rtype"5 \n\t" \ + "vpbroadcastd "#i6"("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd "#i7"("#input"), "#rtype"27 \n\t" \ + "vpbroadcastd "#i8"("#input"), "#rtype"28 \n\t" \ + "vpbroadcastd "#i9"("#input"), "#rtype"29 \n\t" \ + "vpbroadcastd "#i10"("#input"), "#rtype"30 \n\t" \ + "vpbroadcastd 
"#i11"("#input"), "#rtype"31 \n\t" \ + "vpdpbusd "#freg0", "#rtype"26, "#rtype"6 \n\t" \ + "vpdpbusd "#freg0", "#rtype"27, "#rtype"7 \n\t" \ + "vpdpbusd "#freg0", "#rtype"28, "#rtype"8 \n\t" \ + "vpdpbusd "#freg0", "#rtype"29, "#rtype"9 \n\t" \ + "vpdpbusd "#freg0", "#rtype"30, "#rtype"10 \n\t" \ + "vpdpbusd "#freg0", "#rtype"31, "#rtype"11 \n\t" + +#define convKernel1x16c4(input, freg0, off0, preg0, rtype, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, \ + i12, i13, i14, i15, i16, i17, i18, i19, i20, i21, i22, i23) \ + "vpbroadcastd ("#input"), "#rtype"26 \n\t" \ + "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ + "vpdpbusd "#freg0", "#rtype"26, "#rtype"0 \n\t" #else -#define convKernel24x16c4_3(input, freg0, off0, preg0, rtype) \ - "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ - "vpbroadcastd 0x10("#input"), "#rtype"26 \n\t" \ - "vpbroadcastd 0x20("#input"), "#rtype"27 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ - "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ - "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ - "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ - "vpbroadcastd 0x30("#input"), "#rtype"25 \n\t" \ - "vpbroadcastd 0x40("#input"), "#rtype"26 \n\t" \ - "vpbroadcastd 0x50("#input"), "#rtype"27 \n\t" \ - "vpaddd "#rtype"0, "#rtype"28, "#rtype"0 \n\t" \ - "vpaddd "#rtype"1, "#rtype"29, "#rtype"1 \n\t" \ - "vpaddd "#rtype"2, "#rtype"30, "#rtype"2 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ - "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ - "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ - "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ - "vpbroadcastd 0x60("#input"), "#rtype"25 \n\t" \ - "vpbroadcastd 0x70("#input"), "#rtype"26 \n\t" \ - "vpbroadcastd 0x80("#input"), "#rtype"27 \n\t" \ - "vpaddd "#rtype"3, "#rtype"28, "#rtype"3 \n\t" \ - "vpaddd "#rtype"4, "#rtype"29, "#rtype"4 \n\t" \ - "vpaddd "#rtype"5, "#rtype"30, "#rtype"5 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ - "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ - "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ - "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ - "vpbroadcastd 0x90("#input"), "#rtype"25 \n\t" \ - "vpbroadcastd 0xA0("#input"), "#rtype"26 \n\t" \ - "vpbroadcastd 0xB0("#input"), "#rtype"27 \n\t" \ - "vpaddd "#rtype"6, "#rtype"28, "#rtype"6 \n\t" \ - "vpaddd "#rtype"7, "#rtype"29, "#rtype"7 \n\t" \ - "vpaddd "#rtype"8, "#rtype"30, "#rtype"8 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ - "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ - "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ - "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ - "vpbroadcastd 0xC0("#input"), "#rtype"25 \n\t" \ - "vpbroadcastd 0xD0("#input"), "#rtype"26 \n\t" \ - "vpbroadcastd 0xE0("#input"), "#rtype"27 \n\t" \ - "vpaddd "#rtype"9, "#rtype"28, "#rtype"9 \n\t" \ - "vpaddd "#rtype"10, "#rtype"29, "#rtype"10 \n\t" \ - "vpaddd "#rtype"11, "#rtype"30, "#rtype"11 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ 
- "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ - "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ - "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ - "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ - "vpbroadcastd 0xF0("#input"), "#rtype"25 \n\t" \ - "vpbroadcastd 0x100("#input"), "#rtype"26 \n\t" \ - "vpbroadcastd 0x110("#input"), "#rtype"27 \n\t" \ - "vpaddd "#rtype"12, "#rtype"28, "#rtype"12 \n\t" \ - "vpaddd "#rtype"13, "#rtype"29, "#rtype"13 \n\t" \ - "vpaddd "#rtype"14, "#rtype"30, "#rtype"14 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ - "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ - "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ - "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ - "vpbroadcastd 0x120("#input"), "#rtype"25 \n\t" \ - "vpbroadcastd 0x130("#input"), "#rtype"26 \n\t" \ - "vpbroadcastd 0x140("#input"), "#rtype"27 \n\t" \ - "vpaddd "#rtype"15, "#rtype"28, "#rtype"15 \n\t" \ - "vpaddd "#rtype"16, "#rtype"29, "#rtype"16 \n\t" \ - "vpaddd "#rtype"17, "#rtype"30, "#rtype"17 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ - "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ - "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ - "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ - "vpbroadcastd 0x150("#input"), "#rtype"25 \n\t" \ - "vpbroadcastd 0x160("#input"), "#rtype"26 \n\t" \ - "vpbroadcastd 0x170("#input"), "#rtype"27 \n\t" \ - "vpaddd "#rtype"18, "#rtype"28, "#rtype"18 \n\t" \ - "vpaddd "#rtype"19, "#rtype"29, "#rtype"19 \n\t" \ - "vpaddd "#rtype"20, "#rtype"30, "#rtype"20 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ - "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ - "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ - "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ - "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ - "vpaddd "#rtype"21, "#rtype"28, "#rtype"21 \n\t" \ - "vpaddd "#rtype"22, "#rtype"29, "#rtype"22 \n\t" \ - "vpaddd "#rtype"23, "#rtype"30, "#rtype"23 \n\t" - -#define convKernel12x16c4_3(input, freg0, off0, preg0, rtype) \ - "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ - "vpbroadcastd 0x10("#input"), "#rtype"26 \n\t" \ - "vpbroadcastd 0x20("#input"), "#rtype"27 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ - "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ - "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ - "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ - "vpbroadcastd 0x30("#input"), "#rtype"25 \n\t" \ - "vpbroadcastd 0x40("#input"), "#rtype"26 \n\t" \ - "vpbroadcastd 0x50("#input"), "#rtype"27 \n\t" \ - "vpaddd "#rtype"0, "#rtype"28, "#rtype"0 \n\t" \ - "vpaddd "#rtype"1, "#rtype"29, "#rtype"1 \n\t" \ - "vpaddd "#rtype"2, "#rtype"30, "#rtype"2 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ - "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ - "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ - "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ - 
"vpbroadcastd 0x60("#input"), "#rtype"25 \n\t" \ - "vpbroadcastd 0x70("#input"), "#rtype"26 \n\t" \ - "vpbroadcastd 0x80("#input"), "#rtype"27 \n\t" \ - "vpaddd "#rtype"3, "#rtype"28, "#rtype"3 \n\t" \ - "vpaddd "#rtype"4, "#rtype"29, "#rtype"4 \n\t" \ - "vpaddd "#rtype"5, "#rtype"30, "#rtype"5 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ - "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ - "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ - "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ - "vpbroadcastd 0x90("#input"), "#rtype"25 \n\t" \ - "vpbroadcastd 0xA0("#input"), "#rtype"26 \n\t" \ - "vpbroadcastd 0xB0("#input"), "#rtype"27 \n\t" \ - "vpaddd "#rtype"6, "#rtype"28, "#rtype"6 \n\t" \ - "vpaddd "#rtype"7, "#rtype"29, "#rtype"7 \n\t" \ - "vpaddd "#rtype"8, "#rtype"30, "#rtype"8 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ - "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ - "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ - "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ - "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ - "vpaddd "#rtype"9, "#rtype"28, "#rtype"9 \n\t" \ - "vpaddd "#rtype"10, "#rtype"29, "#rtype"10 \n\t" \ - "vpaddd "#rtype"11, "#rtype"30, "#rtype"11 \n\t" - -#define convKernel1x16c4_3(input, freg0, off0, preg0, rtype) \ - "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ - "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ - "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ - "vpaddd "#rtype"0, "#rtype"28, "#rtype"0 \n\t" - -#define convKernel24x16c4(input, freg0, off0, preg0, rtype) \ - convKernel24x16c4_3(input, rtype##24, off0, rtype##25, rtype) - -#define convKernel12x16c4(input, freg0, off0, preg0, rtype) \ - convKernel12x16c4_3(input, rtype##24, off0, rtype##25, rtype) - -#define convKernel1x16c4(input, freg0, off0, preg0, rtype) \ - convKernel1x16c4_3(input, rtype##24, off0, rtype##25, rtype) +#define convKernel24x16c4_3(input, freg0, off0, preg0, rtype, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, \ + i12, i13, i14, i15, i16, i17, i18, i19, i20, i21, i22, i23) \ + "vpbroadcastd "#i0"("#input"), "#rtype"25 \n\t" \ + "vpbroadcastd "#i1"("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd "#i2"("#input"), "#rtype"27 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vpbroadcastd "#i3"("#input"), "#rtype"25 \n\t" \ + "vpbroadcastd "#i4"("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd "#i5"("#input"), "#rtype"27 \n\t" \ + "vpaddd "#rtype"0, "#rtype"28, "#rtype"0 \n\t" \ + "vpaddd "#rtype"1, "#rtype"29, "#rtype"1 \n\t" \ + "vpaddd "#rtype"2, "#rtype"30, "#rtype"2 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vpbroadcastd "#i6"("#input"), "#rtype"25 \n\t" \ + "vpbroadcastd 
"#i7"("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd "#i8"("#input"), "#rtype"27 \n\t" \ + "vpaddd "#rtype"3, "#rtype"28, "#rtype"3 \n\t" \ + "vpaddd "#rtype"4, "#rtype"29, "#rtype"4 \n\t" \ + "vpaddd "#rtype"5, "#rtype"30, "#rtype"5 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vpbroadcastd "#i9"("#input"), "#rtype"25 \n\t" \ + "vpbroadcastd "#i10"("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd "#i11"("#input"), "#rtype"27 \n\t" \ + "vpaddd "#rtype"6, "#rtype"28, "#rtype"6 \n\t" \ + "vpaddd "#rtype"7, "#rtype"29, "#rtype"7 \n\t" \ + "vpaddd "#rtype"8, "#rtype"30, "#rtype"8 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vpbroadcastd "#i12"("#input"), "#rtype"25 \n\t" \ + "vpbroadcastd "#i13"("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd "#i14"("#input"), "#rtype"27 \n\t" \ + "vpaddd "#rtype"9, "#rtype"28, "#rtype"9 \n\t" \ + "vpaddd "#rtype"10, "#rtype"29, "#rtype"10 \n\t" \ + "vpaddd "#rtype"11, "#rtype"30, "#rtype"11 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vpbroadcastd "#i15"("#input"), "#rtype"25 \n\t" \ + "vpbroadcastd "#i16"("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd "#i17"("#input"), "#rtype"27 \n\t" \ + "vpaddd "#rtype"12, "#rtype"28, "#rtype"12 \n\t" \ + "vpaddd "#rtype"13, "#rtype"29, "#rtype"13 \n\t" \ + "vpaddd "#rtype"14, "#rtype"30, "#rtype"14 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vpbroadcastd "#i18"("#input"), "#rtype"25 \n\t" \ + "vpbroadcastd "#i19"("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd "#i20"("#input"), "#rtype"27 \n\t" \ + "vpaddd "#rtype"15, "#rtype"28, "#rtype"15 \n\t" \ + "vpaddd "#rtype"16, "#rtype"29, "#rtype"16 \n\t" \ + "vpaddd "#rtype"17, "#rtype"30, "#rtype"17 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vpbroadcastd "#i21"("#input"), "#rtype"25 \n\t" \ + "vpbroadcastd "#i22"("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd "#i23"("#input"), "#rtype"27 \n\t" \ + "vpaddd "#rtype"18, "#rtype"28, "#rtype"18 \n\t" \ + "vpaddd "#rtype"19, "#rtype"29, "#rtype"19 \n\t" \ + "vpaddd "#rtype"20, "#rtype"30, "#rtype"20 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", 
"#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ + "vpaddd "#rtype"21, "#rtype"28, "#rtype"21 \n\t" \ + "vpaddd "#rtype"22, "#rtype"29, "#rtype"22 \n\t" \ + "vpaddd "#rtype"23, "#rtype"30, "#rtype"23 \n\t" + +#define convKernel12x16c4_3(input, freg0, off0, preg0, rtype, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, \ + i12, i13, i14, i15, i16, i17, i18, i19, i20, i21, i22, i23) \ + "vpbroadcastd "#i0"("#input"), "#rtype"25 \n\t" \ + "vpbroadcastd "#i1"("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd "#i2"("#input"), "#rtype"27 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vpbroadcastd "#i3"("#input"), "#rtype"25 \n\t" \ + "vpbroadcastd "#i4"("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd "#i5"("#input"), "#rtype"27 \n\t" \ + "vpaddd "#rtype"0, "#rtype"28, "#rtype"0 \n\t" \ + "vpaddd "#rtype"1, "#rtype"29, "#rtype"1 \n\t" \ + "vpaddd "#rtype"2, "#rtype"30, "#rtype"2 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vpbroadcastd "#i6"("#input"), "#rtype"25 \n\t" \ + "vpbroadcastd "#i7"("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd "#i8"("#input"), "#rtype"27 \n\t" \ + "vpaddd "#rtype"3, "#rtype"28, "#rtype"3 \n\t" \ + "vpaddd "#rtype"4, "#rtype"29, "#rtype"4 \n\t" \ + "vpaddd "#rtype"5, "#rtype"30, "#rtype"5 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vpbroadcastd "#i9"("#input"), "#rtype"25 \n\t" \ + "vpbroadcastd "#i10"("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd "#i11"("#input"), "#rtype"27 \n\t" \ + "vpaddd "#rtype"6, "#rtype"28, "#rtype"6 \n\t" \ + "vpaddd "#rtype"7, "#rtype"29, "#rtype"7 \n\t" \ + "vpaddd "#rtype"8, "#rtype"30, "#rtype"8 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ + "vpaddd "#rtype"9, "#rtype"28, "#rtype"9 \n\t" \ + "vpaddd "#rtype"10, "#rtype"29, "#rtype"10 \n\t" \ + "vpaddd "#rtype"11, "#rtype"30, "#rtype"11 \n\t" + +#define convKernel1x16c4_3(input, freg0, off0, preg0, rtype, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, \ + i12, i13, i14, i15, i16, i17, i18, i19, i20, i21, i22, i23) \ + "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddwd 
"#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ + "vpaddd "#rtype"0, "#rtype"28, "#rtype"0 \n\t" + +#define convKernel24x16c4(input, freg0, off0, preg0, rtype, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, \ + i12, i13, i14, i15, i16, i17, i18, i19, i20, i21, i22, i23) \ + convKernel24x16c4_3(input, rtype##24, off0, rtype##25, rtype, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, \ + i12, i13, i14, i15, i16, i17, i18, i19, i20, i21, i22, i23) + +#define convKernel12x16c4(input, freg0, off0, preg0, rtype, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, \ + i12, i13, i14, i15, i16, i17, i18, i19, i20, i21, i22, i23) \ + convKernel12x16c4_3(input, rtype##24, off0, rtype##25, rtype, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, \ + i12, i13, i14, i15, i16, i17, i18, i19, i20, i21, i22, i23) + +#define convKernel1x16c4(input, freg0, off0, preg0, rtype, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, \ + i12, i13, i14, i15, i16, i17, i18, i19, i20, i21, i22, i23) \ + convKernel1x16c4_3(input, rtype##24, off0, rtype##25, rtype, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, \ + i12, i13, i14, i15, i16, i17, i18, i19, i20, i21, i22, i23) #endif #define convKernelForLoopXx16(rnum, wsize, rtype, off0, off1, off2, off3, off4) \ - __asm__ __volatile__("vmovups (%[filter]), "#rtype"24 \n\t" \ - "addq $"#off1", %[filter] \n\t" \ - "mov $1, %%eax \n\t" \ - "vmovd %%eax, %%xmm0 \n\t" \ - "vpbroadcastw %%xmm0, "#rtype"31 \n\t" \ - "movq %[flags], %%rax \n\t" \ - "andq $0x1, %%rax \n\t" \ - "jne 0f \n\t" \ - load16BiasTo##rnum##Regs(%[bias], rtype) \ - "cmpq $0x10, %%rcx \n\t" \ - "jl 4f \n\t" \ - "jmp 1f \n\t" \ - ".align 16 \n\t" \ - "0: \n\t" \ - clear##rnum##Regs(rtype) \ - "cmpq $0x10, %%rcx \n\t" \ - "jl 4f \n\t" \ - ".align 16 \n\t" \ - "1: \n\t" \ - "movq %[input], %%rax \n\t" \ - convKernel##wsize##x16c4(%%rax, rtype##24, off0, rtype##25, rtype) \ - "addq $0x4, %%rax \n\t" \ - convKernel##wsize##x16c4(%%rax, rtype##25, off1, rtype##24, rtype) \ - "addq $0x4, %%rax \n\t" \ - convKernel##wsize##x16c4(%%rax, rtype##24, off2, rtype##25, rtype) \ - "addq $0x4, %%rax \n\t" \ - convKernel##wsize##x16c4(%%rax, rtype##25, off3, rtype##24, rtype) \ - "addq $"#off4", %[filter] \n\t" \ - "addq %[fStep], %[input] \n\t" \ - "subq $0x10, %%rcx \n\t" \ - "cmpq $0x10, %%rcx \n\t" \ - "jge 1b \n\t" \ - "subq %[fStep], %[input] \n\t" \ - "addq %[f8Step], %[input] \n\t" \ - ".align 16 \n\t" \ - "4: \n\t" \ - : "+c" (c.ic), [input] "+r" (c.input), [filter] "+r" (c.filter) \ - : [bias] "r" (c.bias), [kh] "r" (c.kh), [kw] "r" (c.kw), \ - [fStep] "r" (c.fStep), [flags] "r" (c.flags), \ - [f8Step] "r" (c.f8Step) \ - : "%rax", "%rbx", "%r9", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", \ - "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", \ - "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", \ - "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", \ - "%zmm31", "memory", "cc"); \ - if (c.ic > 0) { \ - __asm__ __volatile__("cmpq $0x8, %%rcx \n\t" \ - "jl 2f \n\t" \ - "subq $0x8, %%rcx \n\t" \ - "movq %[input], %%rax \n\t" \ - convKernel##wsize##x16c4(%%rax, rtype##24, off0, rtype##25, rtype) \ - "addq $0x4, %%rax \n\t" \ - convKernel##wsize##x16c4(%%rax, rtype##25, off1, rtype##24, rtype) \ - "addq $"#off2", %[filter] \n\t" \ - "addq %[f4Step], %[input] \n\t" \ - ".align 16 \n\t" \ - "2: \n\t" \ - "cmpq $0x4, %%rcx \n\t" \ - "jl 5f \n\t" \ - 
convKernel##wsize##x16c4(%[input], rtype##24, off0, rtype##25, rtype) \ - ".align 16 \n\t" \ - "5: \n\t" \ - : "+c" (c.ic) \ - : [input] "r" (c.input), [filter] "r" (c.filter), [bias] "r" (c.bias), [kh] "r" (c.kh), [kw] "r" (c.kw), \ - [stepC16] "r" (c.stepC16), [dilateW] "r" (c.dilateW), \ - [dilateH] "r" (c.dilateH), [fStep] "r" (c.fStep), \ - [f4Step] "r" (c.f4Step) \ - : "%rax", "%rbx", "%r9", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", \ - "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", \ - "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", \ - "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", \ - "%zmm31", "memory", "cc"); \ + __asm__ __volatile__("vmovups (%[filter]), "#rtype"24 \n\t" \ + "addq $"#off1", %[filter] \n\t" \ + "mov $1, %%eax \n\t" \ + "vmovd %%eax, %%xmm0 \n\t" \ + "vpbroadcastw %%xmm0, "#rtype"31 \n\t" \ + "movq %[flags], %%rax \n\t" \ + "andq $0x1, %%rax \n\t" \ + "jne 0f \n\t" \ + load16BiasTo##rnum##Regs(%[bias], rtype) \ + "cmpq $0x10, %%rcx \n\t" \ + "jl 4f \n\t" \ + "jmp 1f \n\t" \ + ".align 16 \n\t" \ + "0: \n\t" \ + clear##rnum##Regs(rtype) \ + "cmpq $0x10, %%rcx \n\t" \ + "jl 4f \n\t" \ + ".align 16 \n\t" \ + "1: \n\t" \ + "movq %[input], %%rax \n\t" \ + convKernel##wsize##x16c4(%%rax, rtype##24, off0, rtype##25, rtype, \ + 0x0, 0x10, 0x20, 0x30, 0x40, 0x50, \ + 0x60, 0x70, 0x80, 0x90, 0xA0, 0xB0, \ + 0xC0, 0xD0, 0xE0, 0xF0, 0x100, 0x110, \ + 0x120, 0x130, 0x140, 0x150, 0x160, 0x170) \ + "addq $0x4, %%rax \n\t" \ + convKernel##wsize##x16c4(%%rax, rtype##25, off1, rtype##24, rtype, \ + 0x0, 0x10, 0x20, 0x30, 0x40, 0x50, \ + 0x60, 0x70, 0x80, 0x90, 0xA0, 0xB0, \ + 0xC0, 0xD0, 0xE0, 0xF0, 0x100, 0x110, \ + 0x120, 0x130, 0x140, 0x150, 0x160, 0x170) \ + "addq $0x4, %%rax \n\t" \ + convKernel##wsize##x16c4(%%rax, rtype##24, off2, rtype##25, rtype, \ + 0x0, 0x10, 0x20, 0x30, 0x40, 0x50, \ + 0x60, 0x70, 0x80, 0x90, 0xA0, 0xB0, \ + 0xC0, 0xD0, 0xE0, 0xF0, 0x100, 0x110, \ + 0x120, 0x130, 0x140, 0x150, 0x160, 0x170) \ + "addq $0x4, %%rax \n\t" \ + convKernel##wsize##x16c4(%%rax, rtype##25, off3, rtype##24, rtype, \ + 0x0, 0x10, 0x20, 0x30, 0x40, 0x50, \ + 0x60, 0x70, 0x80, 0x90, 0xA0, 0xB0, \ + 0xC0, 0xD0, 0xE0, 0xF0, 0x100, 0x110, \ + 0x120, 0x130, 0x140, 0x150, 0x160, 0x170) \ + "addq $"#off4", %[filter] \n\t" \ + "addq %[fStep], %[input] \n\t" \ + "subq $0x10, %%rcx \n\t" \ + "cmpq $0x10, %%rcx \n\t" \ + "jge 1b \n\t" \ + "subq %[fStep], %[input] \n\t" \ + "addq %[f8Step], %[input] \n\t" \ + ".align 16 \n\t" \ + "4: \n\t" \ + : "+c" (c.ic), \ + [input] "+r" (c.input), \ + [filter] "+r" (c.filter) \ + : [bias] "r" (c.bias), \ + [kh] "r" (c.kh), \ + [kw] "r" (c.kw), \ + [fStep] "r" (c.fStep), \ + [flags] "r" (c.flags), \ + [f8Step] "r" (c.f8Step) \ + : "%rax", "%rbx", "%r9", \ + "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", \ + "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", \ + "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", \ + "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", \ + "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", \ + "%zmm30", "%zmm31", "memory", "cc"); \ + if (c.ic > 0) { \ + __asm__ __volatile__("cmpq $0x8, %%rcx \n\t" \ + "jl 2f \n\t" \ + "subq $0x8, %%rcx \n\t" \ + "movq %[input], %%rax \n\t" \ + convKernel##wsize##x16c4(%%rax, rtype##24, off0, rtype##25, rtype, \ + 0x0, 0x8, 0x10, 0x18, 0x20, 0x28, \ + 0x30, 0x38, 0x40, 0x48, 0x50, 0x58, \ + 0x60, 0x68, 0x70, 0x78, 0x80, 0x88, \ + 0x90, 0x98, 
0xA0, 0xA8, 0xB0, 0xB8) \ + "addq $0x4, %%rax \n\t" \ + convKernel##wsize##x16c4(%%rax, rtype##25, off1, rtype##24, rtype, \ + 0x0, 0x8, 0x10, 0x18, 0x20, 0x28, \ + 0x30, 0x38, 0x40, 0x48, 0x50, 0x58, \ + 0x60, 0x68, 0x70, 0x78, 0x80, 0x88, \ + 0x90, 0x98, 0xA0, 0xA8, 0xB0, 0xB8) \ + "addq $"#off2", %[filter] \n\t" \ + "addq %[f4Step], %[input] \n\t" \ + ".align 16 \n\t" \ + "2: \n\t" \ + "cmpq $0x4, %%rcx \n\t" \ + "jl 5f \n\t" \ + convKernel##wsize##x16c4(%[input], rtype##24, off0, rtype##25, rtype, \ + 0x0, 0x4, 0x8, 0xC, 0x10, 0x14, \ + 0x18, 0x1C, 0x20, 0x24, 0x28, 0x2C, \ + 0x30, 0x34, 0x38, 0x3C, 0x40, 0x44, \ + 0x48, 0x4C, 0x50, 0x54, 0x58, 0x5C) \ + ".align 16 \n\t" \ + "5: \n\t" \ + : "+c" (c.ic) \ + : [input] "r" (c.input), \ + [filter] "r" (c.filter), \ + [bias] "r" (c.bias), \ + [kh] "r" (c.kh), \ + [kw] "r" (c.kw), \ + [dilateW] "r" (c.dilateW), \ + [dilateH] "r" (c.dilateH), \ + [fStep] "r" (c.fStep), \ + [f4Step] "r" (c.f4Step) \ + : "%rax", "%rbx", "%r9", \ + "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", \ + "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", \ + "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", \ + "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", \ + "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", \ + "%zmm30", "%zmm31", "memory", "cc"); \ } void Avx512Conv1x1Kernel24x16(ConvController &c) { convKernelForLoopXx16(24, 24, %%zmm, 0x0, 0x40, 0x80, 0xC0, 0x100) - __asm__ __volatile__("movq %[output], %%rax \n\t" - "movq %[ostepC16], %%rbx \n\t" - "movq %[flags], %%rcx \n\t" - "and $0x1, %%rcx \n\t" - "je 0f \n\t" - "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" - "vpaddd 0x40(%%rax), %%zmm1, %%zmm1 \n\t" - "vpaddd 0x80(%%rax), %%zmm2, %%zmm2 \n\t" - "vpaddd 0xC0(%%rax), %%zmm3, %%zmm3 \n\t" - "vpaddd 0x100(%%rax), %%zmm4, %%zmm4 \n\t" - "vpaddd 0x140(%%rax), %%zmm5, %%zmm5 \n\t" - "vpaddd 0x180(%%rax), %%zmm6, %%zmm6 \n\t" - "vpaddd 0x1C0(%%rax), %%zmm7, %%zmm7 \n\t" - "vpaddd 0x200(%%rax), %%zmm8, %%zmm8 \n\t" - "vpaddd 0x240(%%rax), %%zmm9, %%zmm9 \n\t" - "vpaddd 0x280(%%rax), %%zmm10, %%zmm10 \n\t" - "vpaddd 0x2C0(%%rax), %%zmm11, %%zmm11 \n\t" - "vpaddd 0x300(%%rax), %%zmm12, %%zmm12 \n\t" - "vpaddd 0x340(%%rax), %%zmm13, %%zmm13 \n\t" - "vpaddd 0x380(%%rax), %%zmm14, %%zmm14 \n\t" - "vpaddd 0x3C0(%%rax), %%zmm15, %%zmm15 \n\t" - "vpaddd 0x400(%%rax), %%zmm16, %%zmm16 \n\t" - "vpaddd 0x440(%%rax), %%zmm17, %%zmm17 \n\t" - "vpaddd 0x480(%%rax), %%zmm18, %%zmm18 \n\t" - "vpaddd 0x4C0(%%rax), %%zmm19, %%zmm19 \n\t" - "vpaddd 0x500(%%rax), %%zmm20, %%zmm20 \n\t" - "vpaddd 0x540(%%rax), %%zmm21, %%zmm21 \n\t" - "vpaddd 0x580(%%rax), %%zmm22, %%zmm22 \n\t" - "vpaddd 0x5C0(%%rax), %%zmm23, %%zmm23 \n\t" - - ".align 16 \n\t" - "0: \n\t" - "movq %[flags], %%rcx \n\t" - "and $0xC, %%rcx \n\t" - "je 1f \n\t" + __asm__ __volatile__("movq %[output], %%rax \n\t" + "movq %[ostepC16], %%rbx \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x1, %%rcx \n\t" + "je 0f \n\t" + "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" + "vpaddd 0x40(%%rax), %%zmm1, %%zmm1 \n\t" + "vpaddd 0x80(%%rax), %%zmm2, %%zmm2 \n\t" + "vpaddd 0xC0(%%rax), %%zmm3, %%zmm3 \n\t" + "vpaddd 0x100(%%rax), %%zmm4, %%zmm4 \n\t" + "vpaddd 0x140(%%rax), %%zmm5, %%zmm5 \n\t" + "vpaddd 0x180(%%rax), %%zmm6, %%zmm6 \n\t" + "vpaddd 0x1C0(%%rax), %%zmm7, %%zmm7 \n\t" + "vpaddd 0x200(%%rax), %%zmm8, %%zmm8 \n\t" + "vpaddd 0x240(%%rax), %%zmm9, %%zmm9 \n\t" + "vpaddd 0x280(%%rax), %%zmm10, %%zmm10 \n\t" + "vpaddd 0x2C0(%%rax), %%zmm11, %%zmm11 \n\t" + "vpaddd 0x300(%%rax), %%zmm12, %%zmm12 
\n\t" + "vpaddd 0x340(%%rax), %%zmm13, %%zmm13 \n\t" + "vpaddd 0x380(%%rax), %%zmm14, %%zmm14 \n\t" + "vpaddd 0x3C0(%%rax), %%zmm15, %%zmm15 \n\t" + "vpaddd 0x400(%%rax), %%zmm16, %%zmm16 \n\t" + "vpaddd 0x440(%%rax), %%zmm17, %%zmm17 \n\t" + "vpaddd 0x480(%%rax), %%zmm18, %%zmm18 \n\t" + "vpaddd 0x4C0(%%rax), %%zmm19, %%zmm19 \n\t" + "vpaddd 0x500(%%rax), %%zmm20, %%zmm20 \n\t" + "vpaddd 0x540(%%rax), %%zmm21, %%zmm21 \n\t" + "vpaddd 0x580(%%rax), %%zmm22, %%zmm22 \n\t" + "vpaddd 0x5C0(%%rax), %%zmm23, %%zmm23 \n\t" + + ".align 16 \n\t" + "0: \n\t" + "cmpq $0x0, %[scale] \n\t" + "jne 1f \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" relu24Regs(%%zmm) + "jmp 4f \n\t" - ".align 16 \n\t" - "1: \n\t" - "cmpq $0x0, %[scale] \n\t" - "je 2f \n\t" + ".align 16 \n\t" + "1: \n\t" convert24RegsI32ToF32(%[scale], %%zmm) - ".align 16 \n\t" - "2: \n\t" - "vmovups %%zmm0, (%%rax) \n\t" - "vmovups %%zmm1, 0x40(%%rax) \n\t" - "vmovups %%zmm2, 0x80(%%rax) \n\t" - "vmovups %%zmm3, 0xC0(%%rax) \n\t" - "vmovups %%zmm4, 0x100(%%rax) \n\t" - "vmovups %%zmm5, 0x140(%%rax) \n\t" - "vmovups %%zmm6, 0x180(%%rax) \n\t" - "vmovups %%zmm7, 0x1C0(%%rax) \n\t" - "vmovups %%zmm8, 0x200(%%rax) \n\t" - "vmovups %%zmm9, 0x240(%%rax) \n\t" - "vmovups %%zmm10, 0x280(%%rax) \n\t" - "vmovups %%zmm11, 0x2C0(%%rax) \n\t" - "vmovups %%zmm12, 0x300(%%rax) \n\t" - "vmovups %%zmm13, 0x340(%%rax) \n\t" - "vmovups %%zmm14, 0x380(%%rax) \n\t" - "vmovups %%zmm15, 0x3C0(%%rax) \n\t" - "vmovups %%zmm16, 0x400(%%rax) \n\t" - "vmovups %%zmm17, 0x440(%%rax) \n\t" - "vmovups %%zmm18, 0x480(%%rax) \n\t" - "vmovups %%zmm19, 0x4C0(%%rax) \n\t" - "vmovups %%zmm20, 0x500(%%rax) \n\t" - "vmovups %%zmm21, 0x540(%%rax) \n\t" - "vmovups %%zmm22, 0x580(%%rax) \n\t" - "vmovups %%zmm23, 0x5C0(%%rax) \n\t" + ".align 16 \n\t" + "2: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x2, %%rcx \n\t" + "je 3f \n\t" + "vaddps (%[eltwise]), %%zmm0, %%zmm0 \n\t" + "vaddps 0x40(%[eltwise]), %%zmm1, %%zmm1 \n\t" + "vaddps 0x80(%[eltwise]), %%zmm2, %%zmm2 \n\t" + "vaddps 0xC0(%[eltwise]), %%zmm3, %%zmm3 \n\t" + "vaddps 0x100(%[eltwise]), %%zmm4, %%zmm4 \n\t" + "vaddps 0x140(%[eltwise]), %%zmm5, %%zmm5 \n\t" + "vaddps 0x180(%[eltwise]), %%zmm6, %%zmm6 \n\t" + "vaddps 0x1C0(%[eltwise]), %%zmm7, %%zmm7 \n\t" + "vaddps 0x200(%[eltwise]), %%zmm8, %%zmm8 \n\t" + "vaddps 0x240(%[eltwise]), %%zmm9, %%zmm9 \n\t" + "vaddps 0x280(%[eltwise]), %%zmm10, %%zmm10 \n\t" + "vaddps 0x2C0(%[eltwise]), %%zmm11, %%zmm11 \n\t" + "vaddps 0x300(%[eltwise]), %%zmm12, %%zmm12 \n\t" + "vaddps 0x340(%[eltwise]), %%zmm13, %%zmm13 \n\t" + "vaddps 0x380(%[eltwise]), %%zmm14, %%zmm14 \n\t" + "vaddps 0x3C0(%[eltwise]), %%zmm15, %%zmm15 \n\t" + "vaddps 0x400(%[eltwise]), %%zmm16, %%zmm16 \n\t" + "vaddps 0x440(%[eltwise]), %%zmm17, %%zmm17 \n\t" + "vaddps 0x480(%[eltwise]), %%zmm18, %%zmm18 \n\t" + "vaddps 0x4C0(%[eltwise]), %%zmm19, %%zmm19 \n\t" + "vaddps 0x500(%[eltwise]), %%zmm20, %%zmm20 \n\t" + "vaddps 0x540(%[eltwise]), %%zmm21, %%zmm21 \n\t" + "vaddps 0x580(%[eltwise]), %%zmm22, %%zmm22 \n\t" + "vaddps 0x5C0(%[eltwise]), %%zmm23, %%zmm23 \n\t" + + ".align 16 \n\t" + "3: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" + relu24RegsPs(%%zmm) + + ".align 16 \n\t" + "4: \n\t" + "vmovups %%zmm0, (%%rax) \n\t" + "vmovups %%zmm1, 0x40(%%rax) \n\t" + "vmovups %%zmm2, 0x80(%%rax) \n\t" + "vmovups %%zmm3, 0xC0(%%rax) \n\t" + "vmovups %%zmm4, 0x100(%%rax) \n\t" + "vmovups %%zmm5, 0x140(%%rax) \n\t" + "vmovups %%zmm6, 0x180(%%rax) \n\t" + "vmovups %%zmm7, 
0x1C0(%%rax) \n\t" + "vmovups %%zmm8, 0x200(%%rax) \n\t" + "vmovups %%zmm9, 0x240(%%rax) \n\t" + "vmovups %%zmm10, 0x280(%%rax) \n\t" + "vmovups %%zmm11, 0x2C0(%%rax) \n\t" + "vmovups %%zmm12, 0x300(%%rax) \n\t" + "vmovups %%zmm13, 0x340(%%rax) \n\t" + "vmovups %%zmm14, 0x380(%%rax) \n\t" + "vmovups %%zmm15, 0x3C0(%%rax) \n\t" + "vmovups %%zmm16, 0x400(%%rax) \n\t" + "vmovups %%zmm17, 0x440(%%rax) \n\t" + "vmovups %%zmm18, 0x480(%%rax) \n\t" + "vmovups %%zmm19, 0x4C0(%%rax) \n\t" + "vmovups %%zmm20, 0x500(%%rax) \n\t" + "vmovups %%zmm21, 0x540(%%rax) \n\t" + "vmovups %%zmm22, 0x580(%%rax) \n\t" + "vmovups %%zmm23, 0x5C0(%%rax) \n\t" : - : [output] "r" (c.output), [ostepC16] "r" (c.ostepC16), [flags] "r" (c.flags), [scale] "r" (c.scale) - : "%rax", "%rbx", "%rcx", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", - "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", - "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", - "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", - "%zmm31", "memory", "cc"); + : [output] "r" (c.output), + [ostepC16] "r" (c.ostepC16), + [eltwise] "r" (c.eltwise), + [flags] "r" (c.flags), + [scale] "r" (c.scale) + : "%rax", "%rbx", "%rcx", + "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", + "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", + "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", + "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", + "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", + "%zmm30","%zmm31", "memory", "cc"); } void Avx512Conv1x1Kernel12x16(ConvController &c) { convKernelForLoopXx16(12, 12, %%zmm, 0x0, 0x40, 0x80, 0xC0, 0x100) - __asm__ __volatile__("movq %[output], %%rax \n\t" - "movq %[ostepC16], %%rbx \n\t" - "movq %[flags], %%rcx \n\t" - "and $0x1, %%rcx \n\t" - "je 0f \n\t" - "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" - "vpaddd 0x40(%%rax), %%zmm1, %%zmm1 \n\t" - "vpaddd 0x80(%%rax), %%zmm2, %%zmm2 \n\t" - "vpaddd 0xC0(%%rax), %%zmm3, %%zmm3 \n\t" - "vpaddd 0x100(%%rax), %%zmm4, %%zmm4 \n\t" - "vpaddd 0x140(%%rax), %%zmm5, %%zmm5 \n\t" - "vpaddd 0x180(%%rax), %%zmm6, %%zmm6 \n\t" - "vpaddd 0x1C0(%%rax), %%zmm7, %%zmm7 \n\t" - "vpaddd 0x200(%%rax), %%zmm8, %%zmm8 \n\t" - "vpaddd 0x240(%%rax), %%zmm9, %%zmm9 \n\t" - "vpaddd 0x280(%%rax), %%zmm10, %%zmm10 \n\t" - "vpaddd 0x2C0(%%rax), %%zmm11, %%zmm11 \n\t" - - ".align 16 \n\t" - "0: \n\t" - "movq %[flags], %%rcx \n\t" - "and $0xC, %%rcx \n\t" - "je 1f \n\t" + __asm__ __volatile__("movq %[output], %%rax \n\t" + "movq %[ostepC16], %%rbx \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x1, %%rcx \n\t" + "je 0f \n\t" + "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" + "vpaddd 0x40(%%rax), %%zmm1, %%zmm1 \n\t" + "vpaddd 0x80(%%rax), %%zmm2, %%zmm2 \n\t" + "vpaddd 0xC0(%%rax), %%zmm3, %%zmm3 \n\t" + "vpaddd 0x100(%%rax), %%zmm4, %%zmm4 \n\t" + "vpaddd 0x140(%%rax), %%zmm5, %%zmm5 \n\t" + "vpaddd 0x180(%%rax), %%zmm6, %%zmm6 \n\t" + "vpaddd 0x1C0(%%rax), %%zmm7, %%zmm7 \n\t" + "vpaddd 0x200(%%rax), %%zmm8, %%zmm8 \n\t" + "vpaddd 0x240(%%rax), %%zmm9, %%zmm9 \n\t" + "vpaddd 0x280(%%rax), %%zmm10, %%zmm10 \n\t" + "vpaddd 0x2C0(%%rax), %%zmm11, %%zmm11 \n\t" + + ".align 16 \n\t" + "0: \n\t" + "cmpq $0x0, %[scale] \n\t" + "jne 1f \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" relu12Regs(%%zmm) + "jmp 4f \n\t" - ".align 16 \n\t" - "1: \n\t" - "cmpq $0x0, %[scale] \n\t" - "je 2f \n\t" + ".align 16 \n\t" + "1: \n\t" convert12RegsI32ToF32(%[scale], %%zmm) - ".align 
16 \n\t" - "2: \n\t" - "vmovups %%zmm0, (%%rax) \n\t" - "vmovups %%zmm1, 0x40(%%rax) \n\t" - "vmovups %%zmm2, 0x80(%%rax) \n\t" - "vmovups %%zmm3, 0xC0(%%rax) \n\t" - "vmovups %%zmm4, 0x100(%%rax) \n\t" - "vmovups %%zmm5, 0x140(%%rax) \n\t" - "vmovups %%zmm6, 0x180(%%rax) \n\t" - "vmovups %%zmm7, 0x1C0(%%rax) \n\t" - "vmovups %%zmm8, 0x200(%%rax) \n\t" - "vmovups %%zmm9, 0x240(%%rax) \n\t" - "vmovups %%zmm10, 0x280(%%rax) \n\t" - "vmovups %%zmm11, 0x2C0(%%rax) \n\t" + ".align 16 \n\t" + "2: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x2, %%rcx \n\t" + "je 3f \n\t" + "vaddps (%[eltwise]), %%zmm0, %%zmm0 \n\t" + "vaddps 0x40(%[eltwise]), %%zmm1, %%zmm1 \n\t" + "vaddps 0x80(%[eltwise]), %%zmm2, %%zmm2 \n\t" + "vaddps 0xC0(%[eltwise]), %%zmm3, %%zmm3 \n\t" + "vaddps 0x100(%[eltwise]), %%zmm4, %%zmm4 \n\t" + "vaddps 0x140(%[eltwise]), %%zmm5, %%zmm5 \n\t" + "vaddps 0x180(%[eltwise]), %%zmm6, %%zmm6 \n\t" + "vaddps 0x1C0(%[eltwise]), %%zmm7, %%zmm7 \n\t" + "vaddps 0x200(%[eltwise]), %%zmm8, %%zmm8 \n\t" + "vaddps 0x240(%[eltwise]), %%zmm9, %%zmm9 \n\t" + "vaddps 0x280(%[eltwise]), %%zmm10, %%zmm10 \n\t" + "vaddps 0x2C0(%[eltwise]), %%zmm11, %%zmm11 \n\t" + + ".align 16 \n\t" + "3: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" + relu12RegsPs(%%zmm) + + ".align 16 \n\t" + "4: \n\t" + "vmovups %%zmm0, (%%rax) \n\t" + "vmovups %%zmm1, 0x40(%%rax) \n\t" + "vmovups %%zmm2, 0x80(%%rax) \n\t" + "vmovups %%zmm3, 0xC0(%%rax) \n\t" + "vmovups %%zmm4, 0x100(%%rax) \n\t" + "vmovups %%zmm5, 0x140(%%rax) \n\t" + "vmovups %%zmm6, 0x180(%%rax) \n\t" + "vmovups %%zmm7, 0x1C0(%%rax) \n\t" + "vmovups %%zmm8, 0x200(%%rax) \n\t" + "vmovups %%zmm9, 0x240(%%rax) \n\t" + "vmovups %%zmm10, 0x280(%%rax) \n\t" + "vmovups %%zmm11, 0x2C0(%%rax) \n\t" : - : [output] "r" (c.output), [ostepC16] "r" (c.ostepC16), [flags] "r" (c.flags), [scale] "r" (c.scale) - : "%rax", "%rbx", "%rcx", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", - "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", - "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", - "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", - "%zmm31", "memory", "cc"); + : [output] "r" (c.output), + [ostepC16] "r" (c.ostepC16), + [eltwise] "r" (c.eltwise), + [flags] "r" (c.flags), + [scale] "r" (c.scale) + : "%rax", "%rbx", "%rcx", + "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", + "%zmm6","%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", + "%zmm24", "%zmm31", "memory", "cc"); } void Avx512Conv1x1Kernel1x16(ConvController &c) { convKernelForLoopXx16(1, 1, %%zmm, 0x0, 0x40, 0x80, 0xC0, 0x100) - __asm__ __volatile__("movq %[output], %%rax \n\t" - "movq %[ostepC16], %%rbx \n\t" - "movq %[flags], %%rcx \n\t" - "and $0x1, %%rcx \n\t" - "je 0f \n\t" - "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" - "vpaddd 0x40(%%rax), %%zmm1, %%zmm1 \n\t" + __asm__ __volatile__("movq %[output], %%rax \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x1, %%rcx \n\t" + "je 0f \n\t" + "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" + "vpaddd 0x40(%%rax), %%zmm1, %%zmm1 \n\t" - ".align 16 \n\t" - "0: \n\t" - "movq %[flags], %%rcx \n\t" - "and $0xC, %%rcx \n\t" - "je 1f \n\t" + ".align 16 \n\t" + "0: \n\t" + "cmpq $0x0, %[scale] \n\t" + "jne 1f \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" reluReg(%%zmm) + "jmp 4f \n\t" - ".align 16 \n\t" - "1: \n\t" - "cmpq $0x0, %[scale] \n\t" - "je 2f \n\t" + ".align 16 \n\t" + "1: \n\t" convertRegI32ToF32(%[scale], %%zmm) - ".align 
16 \n\t" - "2: \n\t" - "vmovups %%zmm0, (%%rax) \n\t" + ".align 16 \n\t" + "2: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x2, %%rcx \n\t" + "je 3f \n\t" + "vaddps (%[eltwise]), %%zmm0, %%zmm0 \n\t" + + ".align 16 \n\t" + "3: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" + reluRegPs(%%zmm) + + ".align 16 \n\t" + "4: \n\t" + "vmovups %%zmm0, (%%rax) \n\t" : - : [output] "r" (c.output), [ostepC16] "r" (c.ostepC16), [flags] "r" (c.flags), [scale] "r" (c.scale) - : "%rax", "%rbx", "%rcx", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", - "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", - "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", - "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", - "%zmm31", "memory", "cc"); + : [output] "r" (c.output), + [eltwise] "r" (c.eltwise), + [flags] "r" (c.flags), + [scale] "r" (c.scale) + : "%rax", "%rbx", "%rcx", + "%zmm0","%zmm24", "%zmm31", "memory", "cc"); } void Avx512Conv1x1Kernel24x8(ConvController &c) { convKernelForLoopXx16(24, 24, %%ymm, 0x0, 0x20, 0x40, 0x60, 0x80) - __asm__ __volatile__("movq %[output], %%rax \n\t" - "movq %[ostepC16], %%rbx \n\t" - "movq %[flags], %%rcx \n\t" - "and $0x1, %%rcx \n\t" - "je 0f \n\t" - "vpaddd (%%rax), %%ymm0, %%ymm0 \n\t" - "vpaddd 0x20(%%rax), %%ymm1, %%ymm1 \n\t" - "vpaddd 0x40(%%rax), %%ymm2, %%ymm2 \n\t" - "vpaddd 0x60(%%rax), %%ymm3, %%ymm3 \n\t" - "vpaddd 0x80(%%rax), %%ymm4, %%ymm4 \n\t" - "vpaddd 0xA0(%%rax), %%ymm5, %%ymm5 \n\t" - "vpaddd 0xC0(%%rax), %%ymm6, %%ymm6 \n\t" - "vpaddd 0xE0(%%rax), %%ymm7, %%ymm7 \n\t" - "vpaddd 0x100(%%rax), %%ymm8, %%ymm8 \n\t" - "vpaddd 0x120(%%rax), %%ymm9, %%ymm9 \n\t" - "vpaddd 0x140(%%rax), %%ymm10, %%ymm10 \n\t" - "vpaddd 0x160(%%rax), %%ymm11, %%ymm11 \n\t" - "vpaddd 0x180(%%rax), %%ymm12, %%ymm12 \n\t" - "vpaddd 0x1A0(%%rax), %%ymm13, %%ymm13 \n\t" - "vpaddd 0x1C0(%%rax), %%ymm14, %%ymm14 \n\t" - "vpaddd 0x1E0(%%rax), %%ymm15, %%ymm15 \n\t" - "vpaddd 0x200(%%rax), %%ymm16, %%ymm16 \n\t" - "vpaddd 0x220(%%rax), %%ymm17, %%ymm17 \n\t" - "vpaddd 0x240(%%rax), %%ymm18, %%ymm18 \n\t" - "vpaddd 0x260(%%rax), %%ymm19, %%ymm19 \n\t" - "vpaddd 0x280(%%rax), %%ymm20, %%ymm20 \n\t" - "vpaddd 0x2A0(%%rax), %%ymm21, %%ymm21 \n\t" - "vpaddd 0x2C0(%%rax), %%ymm22, %%ymm22 \n\t" - "vpaddd 0x2E0(%%rax), %%ymm23, %%ymm23 \n\t" - - ".align 16 \n\t" - "0: \n\t" - "movq %[flags], %%rcx \n\t" - "and $0xC, %%rcx \n\t" - "je 1f \n\t" + __asm__ __volatile__("movq %[output], %%rax \n\t" + "movq %[ostepC16], %%rbx \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x1, %%rcx \n\t" + "je 0f \n\t" + "vpaddd (%%rax), %%ymm0, %%ymm0 \n\t" + "vpaddd 0x20(%%rax), %%ymm1, %%ymm1 \n\t" + "vpaddd 0x40(%%rax), %%ymm2, %%ymm2 \n\t" + "vpaddd 0x60(%%rax), %%ymm3, %%ymm3 \n\t" + "vpaddd 0x80(%%rax), %%ymm4, %%ymm4 \n\t" + "vpaddd 0xA0(%%rax), %%ymm5, %%ymm5 \n\t" + "vpaddd 0xC0(%%rax), %%ymm6, %%ymm6 \n\t" + "vpaddd 0xE0(%%rax), %%ymm7, %%ymm7 \n\t" + "vpaddd 0x100(%%rax), %%ymm8, %%ymm8 \n\t" + "vpaddd 0x120(%%rax), %%ymm9, %%ymm9 \n\t" + "vpaddd 0x140(%%rax), %%ymm10, %%ymm10 \n\t" + "vpaddd 0x160(%%rax), %%ymm11, %%ymm11 \n\t" + "vpaddd 0x180(%%rax), %%ymm12, %%ymm12 \n\t" + "vpaddd 0x1A0(%%rax), %%ymm13, %%ymm13 \n\t" + "vpaddd 0x1C0(%%rax), %%ymm14, %%ymm14 \n\t" + "vpaddd 0x1E0(%%rax), %%ymm15, %%ymm15 \n\t" + "vpaddd 0x200(%%rax), %%ymm16, %%ymm16 \n\t" + "vpaddd 0x220(%%rax), %%ymm17, %%ymm17 \n\t" + "vpaddd 0x240(%%rax), %%ymm18, %%ymm18 \n\t" + "vpaddd 0x260(%%rax), 
%%ymm19, %%ymm19 \n\t" + "vpaddd 0x280(%%rax), %%ymm20, %%ymm20 \n\t" + "vpaddd 0x2A0(%%rax), %%ymm21, %%ymm21 \n\t" + "vpaddd 0x2C0(%%rax), %%ymm22, %%ymm22 \n\t" + "vpaddd 0x2E0(%%rax), %%ymm23, %%ymm23 \n\t" + + ".align 16 \n\t" + "0: \n\t" + "cmpq $0x0, %[scale] \n\t" + "jne 1f \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" relu24Regs(%%ymm) + "jmp 4f \n\t" - ".align 16 \n\t" - "1: \n\t" - "cmpq $0x0, %[scale] \n\t" - "je 2f \n\t" + ".align 16 \n\t" + "1: \n\t" convert24RegsI32ToF32(%[scale], %%ymm) - ".align 16 \n\t" - "2: \n\t" - "vmovups %%ymm0, (%%rax) \n\t" - "vmovups %%ymm1, 0x20(%%rax) \n\t" - "vmovups %%ymm2, 0x40(%%rax) \n\t" - "vmovups %%ymm3, 0x60(%%rax) \n\t" - "vmovups %%ymm4, 0x80(%%rax) \n\t" - "vmovups %%ymm5, 0xA0(%%rax) \n\t" - "vmovups %%ymm6, 0xC0(%%rax) \n\t" - "vmovups %%ymm7, 0xE0(%%rax) \n\t" - "vmovups %%ymm8, 0x100(%%rax) \n\t" - "vmovups %%ymm9, 0x120(%%rax) \n\t" - "vmovups %%ymm10, 0x140(%%rax) \n\t" - "vmovups %%ymm11, 0x160(%%rax) \n\t" - "vmovups %%ymm12, 0x180(%%rax) \n\t" - "vmovups %%ymm13, 0x1A0(%%rax) \n\t" - "vmovups %%ymm14, 0x1C0(%%rax) \n\t" - "vmovups %%ymm15, 0x1E0(%%rax) \n\t" - "vmovups %%ymm16, 0x200(%%rax) \n\t" - "vmovups %%ymm17, 0x220(%%rax) \n\t" - "vmovups %%ymm18, 0x240(%%rax) \n\t" - "vmovups %%ymm19, 0x260(%%rax) \n\t" - "vmovups %%ymm20, 0x280(%%rax) \n\t" - "vmovups %%ymm21, 0x2A0(%%rax) \n\t" - "vmovups %%ymm22, 0x2C0(%%rax) \n\t" - "vmovups %%ymm23, 0x2E0(%%rax) \n\t" + ".align 16 \n\t" + "2: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x2, %%rcx \n\t" + "je 3f \n\t" + "vaddps (%[eltwise]), %%ymm0, %%ymm0 \n\t" + "vaddps 0x20(%[eltwise]), %%ymm1, %%ymm1 \n\t" + "vaddps 0x40(%[eltwise]), %%ymm2, %%ymm2 \n\t" + "vaddps 0x60(%[eltwise]), %%ymm3, %%ymm3 \n\t" + "vaddps 0x80(%[eltwise]), %%ymm4, %%ymm4 \n\t" + "vaddps 0xA0(%[eltwise]), %%ymm5, %%ymm5 \n\t" + "vaddps 0xC0(%[eltwise]), %%ymm6, %%ymm6 \n\t" + "vaddps 0xE0(%[eltwise]), %%ymm7, %%ymm7 \n\t" + "vaddps 0x100(%[eltwise]), %%ymm8, %%ymm8 \n\t" + "vaddps 0x120(%[eltwise]), %%ymm9, %%ymm9 \n\t" + "vaddps 0x140(%[eltwise]), %%ymm10, %%ymm10 \n\t" + "vaddps 0x160(%[eltwise]), %%ymm11, %%ymm11 \n\t" + "vaddps 0x180(%[eltwise]), %%ymm12, %%ymm12 \n\t" + "vaddps 0x1A0(%[eltwise]), %%ymm13, %%ymm13 \n\t" + "vaddps 0x1C0(%[eltwise]), %%ymm14, %%ymm14 \n\t" + "vaddps 0x1E0(%[eltwise]), %%ymm15, %%ymm15 \n\t" + "vaddps 0x200(%[eltwise]), %%ymm16, %%ymm16 \n\t" + "vaddps 0x220(%[eltwise]), %%ymm17, %%ymm17 \n\t" + "vaddps 0x240(%[eltwise]), %%ymm18, %%ymm18 \n\t" + "vaddps 0x260(%[eltwise]), %%ymm19, %%ymm19 \n\t" + "vaddps 0x280(%[eltwise]), %%ymm20, %%ymm20 \n\t" + "vaddps 0x2A0(%[eltwise]), %%ymm21, %%ymm21 \n\t" + "vaddps 0x2C0(%[eltwise]), %%ymm22, %%ymm22 \n\t" + "vaddps 0x2E0(%[eltwise]), %%ymm23, %%ymm23 \n\t" + + ".align 16 \n\t" + "3: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" + relu24RegsPs(%%ymm) + + ".align 16 \n\t" + "4: \n\t" + "vmovups %%ymm0, (%%rax) \n\t" + "vmovups %%ymm1, 0x20(%%rax) \n\t" + "vmovups %%ymm2, 0x40(%%rax) \n\t" + "vmovups %%ymm3, 0x60(%%rax) \n\t" + "vmovups %%ymm4, 0x80(%%rax) \n\t" + "vmovups %%ymm5, 0xA0(%%rax) \n\t" + "vmovups %%ymm6, 0xC0(%%rax) \n\t" + "vmovups %%ymm7, 0xE0(%%rax) \n\t" + "vmovups %%ymm8, 0x100(%%rax) \n\t" + "vmovups %%ymm9, 0x120(%%rax) \n\t" + "vmovups %%ymm10, 0x140(%%rax) \n\t" + "vmovups %%ymm11, 0x160(%%rax) \n\t" + "vmovups %%ymm12, 0x180(%%rax) \n\t" + "vmovups %%ymm13, 0x1A0(%%rax) \n\t" + "vmovups %%ymm14, 0x1C0(%%rax) \n\t" + "vmovups %%ymm15, 0x1E0(%%rax) 
\n\t" + "vmovups %%ymm16, 0x200(%%rax) \n\t" + "vmovups %%ymm17, 0x220(%%rax) \n\t" + "vmovups %%ymm18, 0x240(%%rax) \n\t" + "vmovups %%ymm19, 0x260(%%rax) \n\t" + "vmovups %%ymm20, 0x280(%%rax) \n\t" + "vmovups %%ymm21, 0x2A0(%%rax) \n\t" + "vmovups %%ymm22, 0x2C0(%%rax) \n\t" + "vmovups %%ymm23, 0x2E0(%%rax) \n\t" : - : [output] "r" (c.output), [ostepC16] "r" (c.ostepC16), [flags] "r" (c.flags), [scale] "r" (c.scale) - : "%rax", "%rbx", "%rcx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", - "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", - "%ymm15", "%ymm16", "%ymm17", "%ymm18", "%ymm19", "%ymm20", "%ymm21", "%ymm22", - "%ymm23", "%ymm24", "%ymm25", "%ymm26", "%ymm27", "%ymm28", "%ymm29", "%ymm30", - "%zmm31", "memory", "cc"); + : [output] "r" (c.output), + [ostepC16] "r" (c.ostepC16), + [eltwise] "r" (c.eltwise), + [flags] "r" (c.flags), + [scale] "r" (c.scale) + : "%rax", "%rbx", "%rcx", + "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", + "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", + "%ymm12", "%ymm13", "%ymm14", "%ymm15", "%ymm16", "%ymm17", + "%ymm18", "%ymm19", "%ymm20", "%ymm21", "%ymm22", "%ymm23", + "%ymm24", "%ymm25", "%ymm26", "%ymm27", "%ymm28", "%ymm29", + "%ymm30", "%ymm31", "memory", "cc"); } void Avx512Conv1x1Kernel12x8(ConvController &c) { convKernelForLoopXx16(12, 12, %%ymm, 0x0, 0x20, 0x40, 0x60, 0x80) - __asm__ __volatile__("movq %[output], %%rax \n\t" - "movq %[ostepC16], %%rbx \n\t" - "movq %[flags], %%rcx \n\t" - "and $0x1, %%rcx \n\t" - "je 0f \n\t" - "vpaddd (%%rax), %%ymm0, %%ymm0 \n\t" - "vpaddd 0x20(%%rax), %%ymm1, %%ymm1 \n\t" - "vpaddd 0x40(%%rax), %%ymm2, %%ymm2 \n\t" - "vpaddd 0x60(%%rax), %%ymm3, %%ymm3 \n\t" - "vpaddd 0x80(%%rax), %%ymm4, %%ymm4 \n\t" - "vpaddd 0xA0(%%rax), %%ymm5, %%ymm5 \n\t" - "vpaddd 0xC0(%%rax), %%ymm6, %%ymm6 \n\t" - "vpaddd 0xE0(%%rax), %%ymm7, %%ymm7 \n\t" - "vpaddd 0x100(%%rax), %%ymm8, %%ymm8 \n\t" - "vpaddd 0x120(%%rax), %%ymm9, %%ymm9 \n\t" - "vpaddd 0x140(%%rax), %%ymm10, %%ymm10 \n\t" - "vpaddd 0x160(%%rax), %%ymm11, %%ymm11 \n\t" - - ".align 16 \n\t" - "0: \n\t" - "movq %[flags], %%rcx \n\t" - "and $0xC, %%rcx \n\t" - "je 1f \n\t" + __asm__ __volatile__("movq %[output], %%rax \n\t" + "movq %[ostepC16], %%rbx \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x1, %%rcx \n\t" + "je 0f \n\t" + "vpaddd (%%rax), %%ymm0, %%ymm0 \n\t" + "vpaddd 0x20(%%rax), %%ymm1, %%ymm1 \n\t" + "vpaddd 0x40(%%rax), %%ymm2, %%ymm2 \n\t" + "vpaddd 0x60(%%rax), %%ymm3, %%ymm3 \n\t" + "vpaddd 0x80(%%rax), %%ymm4, %%ymm4 \n\t" + "vpaddd 0xA0(%%rax), %%ymm5, %%ymm5 \n\t" + "vpaddd 0xC0(%%rax), %%ymm6, %%ymm6 \n\t" + "vpaddd 0xE0(%%rax), %%ymm7, %%ymm7 \n\t" + "vpaddd 0x100(%%rax), %%ymm8, %%ymm8 \n\t" + "vpaddd 0x120(%%rax), %%ymm9, %%ymm9 \n\t" + "vpaddd 0x140(%%rax), %%ymm10, %%ymm10 \n\t" + "vpaddd 0x160(%%rax), %%ymm11, %%ymm11 \n\t" + + ".align 16 \n\t" + "0: \n\t" + "cmpq $0x0, %[scale] \n\t" + "jne 1f \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" relu12Regs(%%ymm) + "jmp 4f \n\t" - ".align 16 \n\t" - "1: \n\t" - "cmpq $0x0, %[scale] \n\t" - "je 2f \n\t" + ".align 16 \n\t" + "1: \n\t" convert12RegsI32ToF32(%[scale], %%ymm) - ".align 16 \n\t" - "2: \n\t" - "vmovups %%ymm0, (%%rax) \n\t" - "vmovups %%ymm1, 0x20(%%rax) \n\t" - "vmovups %%ymm2, 0x40(%%rax) \n\t" - "vmovups %%ymm3, 0x60(%%rax) \n\t" - "vmovups %%ymm4, 0x80(%%rax) \n\t" - "vmovups %%ymm5, 0xA0(%%rax) \n\t" - "vmovups %%ymm6, 0xC0(%%rax) \n\t" - "vmovups %%ymm7, 0xE0(%%rax) \n\t" - "vmovups %%ymm8, 
0x100(%%rax) \n\t" - "vmovups %%ymm9, 0x120(%%rax) \n\t" - "vmovups %%ymm10, 0x140(%%rax) \n\t" - "vmovups %%ymm11, 0x160(%%rax) \n\t" + ".align 16 \n\t" + "2: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x2, %%rcx \n\t" + "je 3f \n\t" + "vaddps (%[eltwise]), %%ymm0, %%ymm0 \n\t" + "vaddps 0x20(%[eltwise]), %%ymm1, %%ymm1 \n\t" + "vaddps 0x40(%[eltwise]), %%ymm2, %%ymm2 \n\t" + "vaddps 0x60(%[eltwise]), %%ymm3, %%ymm3 \n\t" + "vaddps 0x80(%[eltwise]), %%ymm4, %%ymm4 \n\t" + "vaddps 0xA0(%[eltwise]), %%ymm5, %%ymm5 \n\t" + "vaddps 0xC0(%[eltwise]), %%ymm6, %%ymm6 \n\t" + "vaddps 0xE0(%[eltwise]), %%ymm7, %%ymm7 \n\t" + "vaddps 0x100(%[eltwise]), %%ymm8, %%ymm8 \n\t" + "vaddps 0x120(%[eltwise]), %%ymm9, %%ymm9 \n\t" + "vaddps 0x140(%[eltwise]), %%ymm10, %%ymm10 \n\t" + "vaddps 0x160(%[eltwise]), %%ymm11, %%ymm11 \n\t" + + ".align 16 \n\t" + "3: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" + relu24RegsPs(%%ymm) + + ".align 16 \n\t" + "4: \n\t" + "vmovups %%ymm0, (%%rax) \n\t" + "vmovups %%ymm1, 0x20(%%rax) \n\t" + "vmovups %%ymm2, 0x40(%%rax) \n\t" + "vmovups %%ymm3, 0x60(%%rax) \n\t" + "vmovups %%ymm4, 0x80(%%rax) \n\t" + "vmovups %%ymm5, 0xA0(%%rax) \n\t" + "vmovups %%ymm6, 0xC0(%%rax) \n\t" + "vmovups %%ymm7, 0xE0(%%rax) \n\t" + "vmovups %%ymm8, 0x100(%%rax) \n\t" + "vmovups %%ymm9, 0x120(%%rax) \n\t" + "vmovups %%ymm10, 0x140(%%rax) \n\t" + "vmovups %%ymm11, 0x160(%%rax) \n\t" : - : [output] "r" (c.output), [ostepC16] "r" (c.ostepC16), [flags] "r" (c.flags), [scale] "r" (c.scale) - : "%rax", "%rbx", "%rcx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", - "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", - "%ymm15", "%ymm16", "%ymm17", "%ymm18", "%ymm19", "%ymm20", "%ymm21", "%ymm22", - "%ymm23", "%ymm24", "%ymm25", "%ymm26", "%ymm27", "%ymm28", "%ymm29", "%ymm30", - "%zmm31", "memory", "cc"); + : [output] "r" (c.output), + [ostepC16] "r" (c.ostepC16), + [eltwise] "r" (c.eltwise), + [flags] "r" (c.flags), + [scale] "r" (c.scale) + : "%rax", "%rbx", "%rcx", + "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", + "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", + "%ymm24", "%ymm31", "memory", "cc"); } void Avx512Conv1x1Kernel1x8(ConvController &c) { convKernelForLoopXx16(1, 1, %%ymm, 0x0, 0x20, 0x40, 0x60, 0x80) - __asm__ __volatile__("movq %[output], %%rax \n\t" - "movq %[ostepC16], %%rbx \n\t" - "movq %[flags], %%rcx \n\t" - "and $0x1, %%rcx \n\t" - "je 0f \n\t" - "vpaddd (%%rax), %%ymm0, %%ymm0 \n\t" + __asm__ __volatile__("movq %[output], %%rax \n\t" + "movq %[ostepC16], %%rbx \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x1, %%rcx \n\t" + "je 0f \n\t" + "vpaddd (%%rax), %%ymm0, %%ymm0 \n\t" - ".align 16 \n\t" - "0: \n\t" - "movq %[flags], %%rcx \n\t" - "and $0xC, %%rcx \n\t" - "je 1f \n\t" + ".align 16 \n\t" + "0: \n\t" + "cmpq $0x0, %[scale] \n\t" + "jne 1f \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" reluReg(%%ymm) + "jmp 4f \n\t" - ".align 16 \n\t" - "1: \n\t" - "cmpq $0x0, %[scale] \n\t" - "je 2f \n\t" + ".align 16 \n\t" + "1: \n\t" convertRegI32ToF32(%[scale], %%ymm) - ".align 16 \n\t" - "2: \n\t" + ".align 16 \n\t" + "2: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x2, %%rcx \n\t" + "je 3f \n\t" + "vaddps (%[eltwise]), %%ymm0, %%ymm0 \n\t" + + ".align 16 \n\t" + "3: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" + reluRegPs(%%ymm) + + ".align 16 \n\t" + "4: \n\t" "vmovups %%ymm0, (%%rax) \n\t" : - : [output] "r" (c.output), [ostepC16] "r" 
(c.ostepC16), [flags] "r" (c.flags), [scale] "r" (c.scale) - : "%rax", "%rbx", "%rcx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", - "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", - "%ymm15", "%ymm16", "%ymm17", "%ymm18", "%ymm19", "%ymm20", "%ymm21", "%ymm22", - "%ymm23", "%ymm24", "%ymm25", "%ymm26", "%ymm27", "%ymm28", "%ymm29", "%ymm30", - "%zmm31", "memory", "cc"); + : [output] "r" (c.output), + [ostepC16] "r" (c.ostepC16), + [eltwise] "r" (c.eltwise), + [flags] "r" (c.flags), + [scale] "r" (c.scale) + : "%rax", "%rbx", "%rcx", + "%ymm0", "%ymm24", "%ymm31", "memory", "cc"); +} + +template <typename T1> +EE activateBias(const T1 *biasArray, T1 *activatedArray, U32 len, ActivationMode mode) { + switch (mode) { + case ACTIVATION_RELU: { + for (U32 ocb = 0; ocb < len; ++ocb) { + activatedArray[ocb] = (biasArray[ocb] <= 0)? 0: biasArray[ocb]; + } + break; + } + case ACTIVATION_RELU6: { + for (U32 ocb = 0; ocb < len; ++ocb) { + activatedArray[ocb] = + (biasArray[ocb] <= 0)? 0: ((biasArray[ocb] >= 6)? 6: biasArray[ocb]); + } + break; + } + default: + return NOT_SUPPORTED; + } + return SUCCESS; +} + +inline void getActivatedBiasForPadding( + const F32 *biasArray, TensorDesc biasDesc, DataType targetType, void *activatedBias, ActivationMode mode, F32 scaleB) +{ + if (targetType == DT_I32) { + CHECK_STATUS(quantize_bias_offsetC((const void *)biasArray, biasDesc, DT_I32, + nullptr, biasDesc, &scaleB, activatedBias)); + CHECK_STATUS(activateBias((const I32 *)activatedBias, + (I32 *)activatedBias, tensorNumElements(biasDesc), mode)); + } else if (targetType == DT_F32) { + CHECK_STATUS(activateBias((const F32 *)biasArray, + (F32 *)activatedBias, tensorNumElements(biasDesc), mode)); + } else { + CHECK_STATUS(NOT_MATCH); + } } // clang-format on EE convolution_1x1_direct(TensorDesc inputDesc, UINT8 *inArray, + F32 *eltwiseInput, TensorDesc filterDesc, const INT8 *filterArray, ConvolutionParamSpec convParamSpec, TensorDesc biasDesc, - const I32 *biasArray, + const F32 *biasArray, U32 tmpBytes, void *tmp, TensorDesc outputDesc, @@ -1967,20 +2263,17 @@ EE convolution_1x1_direct(TensorDesc inputDesc, // get computing params U32 strideH = convParamSpec.stride_h; U32 strideW = convParamSpec.stride_w; - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; U32 dilateH = convParamSpec.dilatedRate_h; U32 dilateW = convParamSpec.dilatedRate_w; - U32 ih_pad = ih + paddingT + paddingB; - U32 iw_pad = iw + paddingL + paddingR; - U32 ih_stride = (ih_pad + strideH - 1) / strideH; - U32 iw_stride = (iw_pad + strideW - 1) / strideW; + U32 ih_stride = (ih + strideH - 1) / strideH; + U32 iw_stride = (iw + strideW - 1) / strideW; U32 ohow = oh * ow; UINT8 *output = (UINT8 *)outArray; - CHECK_REQUIREMENT(paddingT == 0 && paddingB == 0 && paddingL == 0 && paddingR == 0); // infer block params // infer kernel params @@ -1989,7 +2282,6 @@ EE convolution_1x1_direct(TensorDesc inputDesc, convCtl.dilateW = dilateW * SIMDW; convCtl.dilateH = (iw_stride - fw * dilateW + (dilateH - 1) * iw_stride) * SIMDW; convCtl.fStep = ih_stride * iw_stride * SIMDW; - convCtl.stepC16 = 16; convCtl.kw = fw; convCtl.kh = fh; convCtl.scale = nullptr; @@ -2016,9 +2308,12 @@ EE convolution_1x1_direct(TensorDesc inputDesc, tmp 
= (void *)((U8 *)tmp + tensorNumElements(outputDesc) * bytesOf(DT_I32)); outputDesc.dt = DT_I32; } + if (eltwiseInput != nullptr) { + outputDesc.dt = DT_F32; + } F32 *factorPtr = nullptr; F32 factor = 0; - if (scale != nullptr && odt == DT_F32) { + if (scale != nullptr && outputDesc.dt == DT_F32) { factor = 1 / (*scaleO); factorPtr = &factor; } @@ -2029,12 +2324,20 @@ EE convolution_1x1_direct(TensorDesc inputDesc, (const void *)filterArray, filterDesc, scaleO, offsetC)); filterArray += oc * 4; + F32 *activatedBias = (F32 *)tmp; + if (paddingT > 0 || paddingB > 0 || paddingL > 0 || paddingR > 0) { + getActivatedBiasForPadding( + biasArray, biasDesc, outputDesc.dt, activatedBias, activationDesc.mode, *scaleO); + tmp = (void *)((U8 *)tmp + oc * bytesOf(DT_F32)); + } + U32 oBytes = bytesOf(outputDesc.dt); UINT8 *tmpInput = (UINT8 *)tmp; if (idf != DF_NCHWC16) { tmp = (void *)((U8 *)tmp + ic * ih * iw); } UINT8 *useInput = (UINT8 *)tmp; + for (U32 n = 0; n < in; ++n) { UINT8 *bInArray = inArray + n * ic * ih * iw; if (idf == DF_NCHWC16) { @@ -2053,9 +2356,9 @@ EE convolution_1x1_direct(TensorDesc inputDesc, for (U32 w = 0; w < iw_stride; ++w) { U32 nh = h * strideH; U32 nw = w * strideW; - memcpy( + UNI_MEMCPY( useInput + c * ih_stride * iw_stride * SIMDW + (h * iw_stride + w) * SIMDW, - tmpInput + c * ih_pad * iw_pad * SIMDW + (nh * iw_pad + nw) * SIMDW, SIMDW); + tmpInput + c * ih * iw * SIMDW + (nh * iw + nw) * SIMDW, SIMDW); } } } else { @@ -2068,6 +2371,7 @@ EE convolution_1x1_direct(TensorDesc inputDesc, icSize = UNI_MIN(BLOCK_IC_DIM, ic - icbb); flags |= (icbb > 0); if (icbb == (int)ic - icSize) { + flags |= (eltwiseInput != nullptr) << 1; flags |= U32(activationDesc.mode) << 2; convCtl.scale = factorPtr; } @@ -2077,35 +2381,94 @@ EE convolution_1x1_direct(TensorDesc inputDesc, if (icSize < SIMDW) { simdC = icSizeArray[icSize >> 3]; } - U32 hwSize = 0; - for (U32 hw = 0; hw < ohow; hw += hwSize) { - U32 ocSize = 0; - hwSize = UNI_MIN(BLOCK_HW_DIM, ohow - hw); - for (U32 ocb = 0; ocb < oc; ocb += ocSize) { - ocSize = UNI_MIN(unrollOc, oc - ocb); - ocSize = ocSizeArray[ocSize >> 4]; - simdOc = UNI_MIN(SIMDW, ocSize); - convCtl.bias = offsetC + ocb; - UINT8 *curI = useInput + icbb * ih_stride * iw_stride; - U32 wSize = 8; - U32 unrollW = wSizeArray[ocSize >> 4]; - for (U32 ihw = hw; ihw < hw + hwSize; ihw += wSize) { - wSize = UNI_MIN(hw + hwSize - ihw, unrollW); - U32 idx = wSize * 2 / unrollW; - wSize = UNI_MAX(idx * unrollW / 2, 1); - U32 in_h = ihw / ow; - U32 in_w = ihw % ow; - convCtl.input = curI + in_h * iw_stride * simdC + in_w * simdC; - convCtl.output = output + ((n * oc + ocb) * ohow + ihw * simdOc) * oBytes; - convCtl.filter = filterArray + ocb * ic * fh * fw + ocSize * icbb * fh * fw; - if ((ic % 16 != 0) && (icbb == (int)ic - icSize)) { - U32 cx = (ic % 8 == 0) ? 
8 : 4; - convCtl.f8Step = - convCtl.fStep - (in_h * iw_stride + in_w) * (SIMDW - cx); - convCtl.f4Step = convCtl.fStep / 2 - (in_h * iw_stride + in_w) * (8 - 4); + if (paddingT == 0 && paddingB == 0 && paddingL == 0 && paddingR == 0) { + U32 hwSize = 0; + for (U32 hw = 0; hw < ohow; hw += hwSize) { + U32 ocSize = 0; + hwSize = UNI_MIN(BLOCK_HW_DIM, ohow - hw); + for (U32 ocb = 0; ocb < oc; ocb += ocSize) { + ocSize = UNI_MIN(unrollOc, oc - ocb); + ocSize = ocSizeArray[ocSize >> 4]; + simdOc = UNI_MIN(SIMDW, ocSize); + convCtl.bias = offsetC + ocb; + UINT8 *curI = useInput + icbb * ih_stride * iw_stride; + U32 wSize = 8; + U32 unrollW = wSizeArray[ocSize >> 4]; + for (U32 ihw = hw; ihw < hw + hwSize; ihw += wSize) { + wSize = UNI_MIN(hw + hwSize - ihw, unrollW); + U32 idx = wSize * 2 / unrollW; + wSize = UNI_MAX(idx * unrollW / 2, 1); + U32 in_h = ihw / ow; + U32 in_w = ihw % ow; + convCtl.input = curI + in_h * iw_stride * simdC + in_w * simdC; + convCtl.output = + output + ((n * oc + ocb) * ohow + ihw * simdOc) * oBytes; + convCtl.eltwise = eltwiseInput + (n * oc + ocb) * ohow + ihw * simdOc; + convCtl.filter = + filterArray + ocb * ic * fh * fw + ocSize * icbb * fh * fw; + if ((ic % 16 != 0) && (icbb == (int)ic - icSize)) { + U32 cx = (ic % 8 == 0) ? 8 : 4; + convCtl.f8Step = + convCtl.fStep - (in_h * iw_stride + in_w) * (SIMDW - cx); + convCtl.f4Step = + convCtl.fStep / 2 - (in_h * iw_stride + in_w) * (8 - 4); + } + convCtl.ic = icSize; + kernel[ocSize >> 4][idx](convCtl); + } + } + } + } else { + for (U32 h = 0; h < oh; ++h) { + U32 ocSize = 0; + for (U32 ocb = 0; ocb < oc; ocb += ocSize) { + ocSize = UNI_MIN(unrollOc, oc - ocb); + ocSize = ocSizeArray[ocSize >> 4]; + simdOc = UNI_MIN(SIMDW, ocSize); + convCtl.bias = offsetC + ocb; + UINT8 *curI = useInput + icbb * ih_stride * iw_stride; + U32 wSize = 8; + U32 unrollW = wSizeArray[ocSize >> 4]; + for (U32 w = 0; w < ow; w += wSize) { + wSize = 1; + convCtl.output = + output + ((n * oc + ocb) * ohow + (h * ow + w) * simdOc) * oBytes; + convCtl.eltwise = eltwiseInput + + ((n * oc + ocb) * ohow + (h * ow + w) * simdOc) * oBytes; + // directly store activated bias + if ((h < paddingT) || (h >= ih_stride + paddingT) || (w < paddingL) || + (w >= paddingL + iw_stride)) { + if (!(flags & 0x2) && (icbb == (int)ic - icSize)) { + int oci = 0; + for (oci = 0; oci < (int)ocSize + 1 - SIMDW; oci += SIMDW) { + UNI_MEMCPY(((U8 *)convCtl.output) + ohow * oci * oBytes, + activatedBias + oci + ocb, SIMDW * oBytes); + } + for (; oci < (int)ocSize; oci += 8) { + UNI_MEMCPY(((U8 *)convCtl.output) + ohow * oci * oBytes, + activatedBias + oci + ocb, 8 * oBytes); + } + } + continue; + } + wSize = UNI_MIN(iw_stride - (w - paddingL), unrollW); + U32 idx = wSize * 2 / unrollW; + wSize = UNI_MAX(idx * unrollW / 2, 1); + + convCtl.input = + curI + (h - paddingT) * iw_stride * simdC + (w - paddingL) * simdC; + convCtl.filter = + filterArray + ocb * ic * fh * fw + ocSize * icbb * fh * fw; + if ((ic % 16 != 0) && (icbb == (int)ic - icSize)) { + U32 cx = (ic % 8 == 0) ? 
8 : 4; + convCtl.f8Step = convCtl.fStep - + ((h - paddingT) * iw_stride + (w - paddingL)) * (SIMDW - cx); + convCtl.f4Step = convCtl.fStep / 2 - + ((h - paddingT) * iw_stride + (w - paddingL)) * (8 - 4); + } + convCtl.ic = icSize; + kernel[ocSize >> 4][idx](convCtl); } - convCtl.ic = icSize; - kernel[ocSize >> 4][idx](convCtl); } } } diff --git a/compute/tensor/src/cpu/x86/int8/convolution_direct.cpp b/compute/tensor/src/cpu/x86/int8/convolution_direct.cpp index 7279f7fe..ad6767e3 100644 --- a/compute/tensor/src/cpu/x86/int8/convolution_direct.cpp +++ b/compute/tensor/src/cpu/x86/int8/convolution_direct.cpp @@ -17,2179 +17,3297 @@ #include "error.h" #include "transform_functions_int8.h" #include "cpu/x86/int8/tensor_computing_int8.h" +#include "cpu/x86/int8/convolution_functions.h" #include "cpu/x86/tensor_computing_x86.h" +#include "cpu/tensor_computing_cpu.h" #define SIMDW 16 #define BLOCK_IC_DIM 128 -#define BLOCK_HW_DIM 1024 - -struct ConvController { - UINT8 *input; - const INT8 *filter; - void *output; - UINT8 *u8Output; - const I32 *bias; - I64 ic; - I64 kw; - I64 kh; - I64 stepC16; - I64 dilateW; - I64 dilateH; - I64 ostepC16; - I64 flags; - I64 fStep; - I64 f8Step; - I64 f4Step; - void *scale; -}; - -typedef void (*kernelFunc)(ConvController &c); +#define BLOCK_HW_DIM 96 // clang-format off -#define clear1Regs(rtype) \ - "vxorps "#rtype"0, "#rtype"0, "#rtype"0 \n\t" - -#define clear2Regs(rtype) \ - clear1Regs(rtype) \ - "vxorps "#rtype"1, "#rtype"1, "#rtype"1 \n\t" - -#define clear3Regs(rtype) \ - clear2Regs(rtype) \ - "vxorps "#rtype"2, "#rtype"2, "#rtype"2 \n\t" - -#define clear12Regs(rtype) \ - clear3Regs(rtype) \ - "vxorps "#rtype"3, "#rtype"3, "#rtype"3 \n\t" \ - "vxorps "#rtype"4, "#rtype"4, "#rtype"4 \n\t" \ - "vxorps "#rtype"5, "#rtype"5, "#rtype"5 \n\t" \ - "vxorps "#rtype"6, "#rtype"6, "#rtype"6 \n\t" \ - "vxorps "#rtype"7, "#rtype"7, "#rtype"7 \n\t" \ - "vxorps "#rtype"8, "#rtype"8, "#rtype"8 \n\t" \ - "vxorps "#rtype"9, "#rtype"9, "#rtype"9 \n\t" \ - "vxorps "#rtype"10, "#rtype"10, "#rtype"10 \n\t" \ - "vxorps "#rtype"11, "#rtype"11, "#rtype"11 \n\t" - -#define clear24Regs(rtype) \ - clear12Regs(rtype) \ - "vxorps "#rtype"12, "#rtype"12, "#rtype"12 \n\t" \ - "vxorps "#rtype"13, "#rtype"13, "#rtype"13 \n\t" \ - "vxorps "#rtype"14, "#rtype"14, "#rtype"14 \n\t" \ - "vxorps "#rtype"15, "#rtype"15, "#rtype"15 \n\t" \ - "vxorps "#rtype"16, "#rtype"16, "#rtype"16 \n\t" \ - "vxorps "#rtype"17, "#rtype"17, "#rtype"17 \n\t" \ - "vxorps "#rtype"18, "#rtype"18, "#rtype"18 \n\t" \ - "vxorps "#rtype"19, "#rtype"19, "#rtype"19 \n\t" \ - "vxorps "#rtype"20, "#rtype"20, "#rtype"20 \n\t" \ - "vxorps "#rtype"21, "#rtype"21, "#rtype"21 \n\t" \ - "vxorps "#rtype"22, "#rtype"22, "#rtype"22 \n\t" \ - "vxorps "#rtype"23, "#rtype"23, "#rtype"23 \n\t" - -#define reluReg(rtype) \ - "vpxord "#rtype"31, "#rtype"31, "#rtype"31 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"0, "#rtype"0 \n\t" - -#define relu2Regs(rtype) \ - reluReg(rtype) \ - "vpmaxsd "#rtype"31, "#rtype"1, "#rtype"1 \n\t" - -#define relu3Regs(rtype) \ - relu2Regs(rtype) \ - "vpmaxsd "#rtype"31, "#rtype"2, "#rtype"2 \n\t" - -#define relu12Regs(rtype) \ - relu3Regs(rtype) \ - "vpmaxsd "#rtype"31, "#rtype"3, "#rtype"3 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"4, "#rtype"4 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"5, "#rtype"5 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"6, "#rtype"6 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"7, "#rtype"7 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"8, "#rtype"8 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"9, "#rtype"9 \n\t" \ - "vpmaxsd 
"#rtype"31, "#rtype"10, "#rtype"10 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"11, "#rtype"11 \n\t" - -#define relu24Regs(rtype) \ - relu12Regs(rtype) \ - "vpmaxsd "#rtype"31, "#rtype"12, "#rtype"12 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"13, "#rtype"13 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"14, "#rtype"14 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"15, "#rtype"15 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"16, "#rtype"16 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"17, "#rtype"17 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"18, "#rtype"18 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"19, "#rtype"19 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"20, "#rtype"20 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"21, "#rtype"21 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"22, "#rtype"22 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"23, "#rtype"23 \n\t" - -#define convertRegI32ToF32(scalePtr, rtype) \ - "vbroadcastss ("#scalePtr"), "#rtype"24 \n\t" \ - "vcvtdq2ps "#rtype"0, "#rtype"0 \n\t" \ - "vmulps "#rtype"0, "#rtype"24, "#rtype"0 \n\t" \ - -#define convert2RegsI32ToF32(scalePtr, rtype) \ - "vbroadcastss ("#scalePtr"), "#rtype"24 \n\t" \ - "vcvtdq2ps "#rtype"0, "#rtype"0 \n\t" \ - "vcvtdq2ps "#rtype"1, "#rtype"1 \n\t" \ - "vmulps "#rtype"0, "#rtype"24, "#rtype"0 \n\t" \ - "vmulps "#rtype"1, "#rtype"24, "#rtype"1 \n\t" \ - -#define convert3RegsI32ToF32(scalePtr, rtype) \ - "vbroadcastss ("#scalePtr"), "#rtype"24 \n\t" \ - "vcvtdq2ps "#rtype"0, "#rtype"0 \n\t" \ - "vcvtdq2ps "#rtype"1, "#rtype"1 \n\t" \ - "vcvtdq2ps "#rtype"2, "#rtype"2 \n\t" \ - "vmulps "#rtype"0, "#rtype"24, "#rtype"0 \n\t" \ - "vmulps "#rtype"1, "#rtype"24, "#rtype"1 \n\t" \ - "vmulps "#rtype"2, "#rtype"24, "#rtype"2 \n\t" -#define convert12RegsI32ToF32(scalePtr, rtype) \ - "vbroadcastss ("#scalePtr"), "#rtype"24 \n\t" \ - "vcvtdq2ps "#rtype"0, "#rtype"0 \n\t" \ - "vcvtdq2ps "#rtype"1, "#rtype"1 \n\t" \ - "vcvtdq2ps "#rtype"2, "#rtype"2 \n\t" \ - "vcvtdq2ps "#rtype"3, "#rtype"3 \n\t" \ - "vcvtdq2ps "#rtype"4, "#rtype"4 \n\t" \ - "vcvtdq2ps "#rtype"5, "#rtype"5 \n\t" \ - "vcvtdq2ps "#rtype"6, "#rtype"6 \n\t" \ - "vcvtdq2ps "#rtype"7, "#rtype"7 \n\t" \ - "vcvtdq2ps "#rtype"8, "#rtype"8 \n\t" \ - "vcvtdq2ps "#rtype"9, "#rtype"9 \n\t" \ - "vcvtdq2ps "#rtype"10, "#rtype"10 \n\t" \ - "vcvtdq2ps "#rtype"11, "#rtype"11 \n\t" \ - "vmulps "#rtype"0, "#rtype"24, "#rtype"0 \n\t" \ - "vmulps "#rtype"1, "#rtype"24, "#rtype"1 \n\t" \ - "vmulps "#rtype"2, "#rtype"24, "#rtype"2 \n\t" \ - "vmulps "#rtype"3, "#rtype"24, "#rtype"3 \n\t" \ - "vmulps "#rtype"4, "#rtype"24, "#rtype"4 \n\t" \ - "vmulps "#rtype"5, "#rtype"24, "#rtype"5 \n\t" \ - "vmulps "#rtype"6, "#rtype"24, "#rtype"6 \n\t" \ - "vmulps "#rtype"7, "#rtype"24, "#rtype"7 \n\t" \ - "vmulps "#rtype"8, "#rtype"24, "#rtype"8 \n\t" \ - "vmulps "#rtype"9, "#rtype"24, "#rtype"9 \n\t" \ - "vmulps "#rtype"10, "#rtype"24, "#rtype"10 \n\t" \ - "vmulps "#rtype"11, "#rtype"24, "#rtype"11 \n\t" - -#define convert24RegsI32ToF32(scalePtr, rtype) \ - convert12RegsI32ToF32(scalePtr, rtype) \ - "vcvtdq2ps "#rtype"12, "#rtype"12 \n\t" \ - "vcvtdq2ps "#rtype"13, "#rtype"13 \n\t" \ - "vcvtdq2ps "#rtype"14, "#rtype"14 \n\t" \ - "vcvtdq2ps "#rtype"15, "#rtype"15 \n\t" \ - "vcvtdq2ps "#rtype"16, "#rtype"16 \n\t" \ - "vcvtdq2ps "#rtype"17, "#rtype"17 \n\t" \ - "vcvtdq2ps "#rtype"18, "#rtype"18 \n\t" \ - "vcvtdq2ps "#rtype"19, "#rtype"19 \n\t" \ - "vcvtdq2ps "#rtype"20, "#rtype"20 \n\t" \ - "vcvtdq2ps "#rtype"21, "#rtype"21 \n\t" \ - "vcvtdq2ps "#rtype"22, "#rtype"22 \n\t" \ - "vcvtdq2ps "#rtype"23, "#rtype"23 \n\t" \ - "vmulps "#rtype"12, "#rtype"24, "#rtype"12 \n\t" \ - 
"vmulps "#rtype"13, "#rtype"24, "#rtype"13 \n\t" \ - "vmulps "#rtype"14, "#rtype"24, "#rtype"14 \n\t" \ - "vmulps "#rtype"15, "#rtype"24, "#rtype"15 \n\t" \ - "vmulps "#rtype"16, "#rtype"24, "#rtype"16 \n\t" \ - "vmulps "#rtype"17, "#rtype"24, "#rtype"17 \n\t" \ - "vmulps "#rtype"18, "#rtype"24, "#rtype"18 \n\t" \ - "vmulps "#rtype"19, "#rtype"24, "#rtype"19 \n\t" \ - "vmulps "#rtype"20, "#rtype"24, "#rtype"20 \n\t" \ - "vmulps "#rtype"21, "#rtype"24, "#rtype"21 \n\t" \ - "vmulps "#rtype"22, "#rtype"24, "#rtype"22 \n\t" \ - "vmulps "#rtype"23, "#rtype"24, "#rtype"23 \n\t" -#define load48BiasTo3Regs(bias) \ - "vmovups ("#bias"), %%zmm0 \n\t" \ - "vmovups 0x40("#bias"), %%zmm1 \n\t" \ - "vmovups 0x80("#bias"), %%zmm2 \n\t" \ - -#define load48BiasTo12Regs(bias) \ - load48BiasTo3Regs(bias) \ - "vmovups %%zmm0, %%zmm3 \n\t" \ - "vmovups %%zmm1, %%zmm4 \n\t" \ - "vmovups %%zmm2, %%zmm5 \n\t" \ - "vmovups %%zmm0, %%zmm6 \n\t" \ - "vmovups %%zmm1, %%zmm7 \n\t" \ - "vmovups %%zmm2, %%zmm8 \n\t" \ - "vmovups %%zmm0, %%zmm9 \n\t" \ - "vmovups %%zmm1, %%zmm10 \n\t" \ - "vmovups %%zmm2, %%zmm11 \n\t" - -#define load48BiasTo24Regs(bias) \ - load48BiasTo12Regs(bias) \ - "vmovups %%zmm0, %%zmm12 \n\t" \ - "vmovups %%zmm1, %%zmm13 \n\t" \ - "vmovups %%zmm2, %%zmm14 \n\t" \ - "vmovups %%zmm0, %%zmm15 \n\t" \ - "vmovups %%zmm1, %%zmm16 \n\t" \ - "vmovups %%zmm2, %%zmm17 \n\t" \ - "vmovups %%zmm0, %%zmm18 \n\t" \ - "vmovups %%zmm1, %%zmm19 \n\t" \ - "vmovups %%zmm2, %%zmm20 \n\t" \ - "vmovups %%zmm0, %%zmm21 \n\t" \ - "vmovups %%zmm1, %%zmm22 \n\t" \ - "vmovups %%zmm2, %%zmm23 \n\t" - #ifdef _USE_AVX512_VNNI -#define convKernel8x48c4(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2) \ - "vpbroadcastd ("#input"), %%zmm30 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), %%zmm31 \n\t" \ - "vpdpbusd "#freg0", %%zmm30, %%zmm0 \n\t" \ - "vpdpbusd "#freg1", %%zmm30, %%zmm1 \n\t" \ - "vpdpbusd "#freg2", %%zmm30, %%zmm2 \n\t" \ - "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ - "vpdpbusd "#freg0", %%zmm31, %%zmm3 \n\t" \ - "vpdpbusd "#freg1", %%zmm31, %%zmm4 \n\t" \ - "vpdpbusd "#freg2", %%zmm31, %%zmm5 \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpbroadcastd ("#input"), %%zmm30 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), %%zmm31 \n\t" \ - "vpdpbusd "#freg0", %%zmm30, %%zmm6 \n\t" \ - "vpdpbusd "#freg1", %%zmm30, %%zmm7 \n\t" \ - "vpdpbusd "#freg2", %%zmm30, %%zmm8 \n\t" \ - "vmovups "#off1"(%[filter]), "#preg1" \n\t" \ - "vpdpbusd "#freg0", %%zmm31, %%zmm9 \n\t" \ - "vpdpbusd "#freg1", %%zmm31, %%zmm10 \n\t" \ - "vpdpbusd "#freg2", %%zmm31, %%zmm11 \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpbroadcastd ("#input"), %%zmm30 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), %%zmm31 \n\t" \ - "vpdpbusd "#freg0", %%zmm30, %%zmm12 \n\t" \ - "vpdpbusd "#freg1", %%zmm30, %%zmm13 \n\t" \ - "vpdpbusd "#freg2", %%zmm30, %%zmm14 \n\t" \ - "vmovups "#off2"(%[filter]), "#preg2" \n\t" \ - "vpdpbusd "#freg0", %%zmm31, %%zmm15 \n\t" \ - "vpdpbusd "#freg1", %%zmm31, %%zmm16 \n\t" \ - "vpdpbusd "#freg2", %%zmm31, %%zmm17 \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpbroadcastd ("#input"), %%zmm30 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), %%zmm31 \n\t" \ - "vpdpbusd "#freg0", %%zmm30, %%zmm18 \n\t" \ - "vpdpbusd "#freg1", %%zmm30, %%zmm19 \n\t" \ - "vpdpbusd "#freg2", %%zmm30, %%zmm20 \n\t" \ - "vpdpbusd "#freg0", %%zmm31, %%zmm21 \n\t" \ - "vpdpbusd "#freg1", %%zmm31, %%zmm22 \n\t" \ - 
"vpdpbusd "#freg2", %%zmm31, %%zmm23 \n\t" - -#define convKernel4x48c4(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2) \ - "vpbroadcastd ("#input"), %%zmm30 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), %%zmm31 \n\t" \ - "vpdpbusd "#freg0", %%zmm30, %%zmm0 \n\t" \ - "vpdpbusd "#freg1", %%zmm30, %%zmm1 \n\t" \ - "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ - "vpdpbusd "#freg2", %%zmm30, %%zmm2 \n\t" \ - "vpdpbusd "#freg0", %%zmm31, %%zmm3 \n\t" \ - "vpdpbusd "#freg1", %%zmm31, %%zmm4 \n\t" \ - "vpdpbusd "#freg2", %%zmm31, %%zmm5 \n\t" \ - "vmovups "#off1"(%[filter]), "#preg1" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpbroadcastd ("#input"), %%zmm30 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), %%zmm31 \n\t" \ - "vpdpbusd "#freg0", %%zmm30, %%zmm6 \n\t" \ - "vpdpbusd "#freg1", %%zmm30, %%zmm7 \n\t" \ - "vpdpbusd "#freg2", %%zmm30, %%zmm8 \n\t" \ - "vpdpbusd "#freg0", %%zmm31, %%zmm9 \n\t" \ - "vmovups "#off2"(%[filter]), "#preg2" \n\t" \ - "vpdpbusd "#freg1", %%zmm31, %%zmm10 \n\t" \ - "vpdpbusd "#freg2", %%zmm31, %%zmm11 \n\t" - -#define convKernel1x48c4(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2) \ - "vpbroadcastd ("#input"), %%zmm30 \n\t" \ - "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ - "vmovups "#off1"(%[filter]), "#preg1" \n\t" \ - "vmovups "#off2"(%[filter]), "#preg2" \n\t" \ - "vpdpbusd "#freg0", %%zmm30, %%zmm0 \n\t" \ - "vpdpbusd "#freg1", %%zmm30, %%zmm1 \n\t" \ - "vpdpbusd "#freg2", %%zmm30, %%zmm2 \n\t" +#define convKernel8x48c4_1(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2) \ + "movq (%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm31 \n\t" \ + "vpdpbusd "#freg0", %%zmm30, %%zmm0 \n\t" \ + "vpdpbusd "#freg1", %%zmm30, %%zmm1 \n\t" \ + "vpdpbusd "#freg2", %%zmm30, %%zmm2 \n\t" \ + "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x8(%[stepC16]), "#input" \n\t" \ + "movq 0x10(%[stepC16]), %%r10 \n\t" \ + "vpdpbusd "#freg0", %%zmm31, %%zmm3 \n\t" \ + "vpdpbusd "#freg1", %%zmm31, %%zmm4 \n\t" \ + "vpdpbusd "#freg2", %%zmm31, %%zmm5 \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm31 \n\t" \ + "vpdpbusd "#freg0", %%zmm30, %%zmm6 \n\t" \ + "vpdpbusd "#freg1", %%zmm30, %%zmm7 \n\t" \ + "vpdpbusd "#freg2", %%zmm30, %%zmm8 \n\t" \ + "vmovups "#off1"(%[filter]), "#preg1" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x18(%[stepC16]), "#input" \n\t" \ + "movq 0x20(%[stepC16]), %%r10 \n\t" \ + "vpdpbusd "#freg0", %%zmm31, %%zmm9 \n\t" \ + "vpdpbusd "#freg1", %%zmm31, %%zmm10 \n\t" \ + "vpdpbusd "#freg2", %%zmm31, %%zmm11 \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm31 \n\t" \ + "vpdpbusd "#freg0", %%zmm30, %%zmm12 \n\t" \ + "vpdpbusd "#freg1", %%zmm30, %%zmm13 \n\t" \ + "vpdpbusd "#freg2", %%zmm30, %%zmm14 \n\t" \ + "vmovups "#off2"(%[filter]), "#preg2" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x28(%[stepC16]), "#input" \n\t" \ + "movq 0x30(%[stepC16]), %%r10 \n\t" \ + "vpdpbusd "#freg0", %%zmm31, %%zmm15 \n\t" \ + "vpdpbusd "#freg1", %%zmm31, %%zmm16 \n\t" \ + "vpdpbusd "#freg2", %%zmm31, %%zmm17 \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm31 \n\t" \ + "vpdpbusd "#freg0", %%zmm30, %%zmm18 \n\t" \ + "vpdpbusd "#freg1", %%zmm30, %%zmm19 \n\t" \ + "vpdpbusd "#freg2", %%zmm30, %%zmm20 \n\t" \ + "vpdpbusd "#freg0", %%zmm31, %%zmm21 \n\t" 
\ + "vpdpbusd "#freg1", %%zmm31, %%zmm22 \n\t" \ + "vpdpbusd "#freg2", %%zmm31, %%zmm23 \n\t" + +#define convKernel4x48c4_1(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2) \ + "movq (%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm31 \n\t" \ + "vpdpbusd "#freg0", %%zmm30, %%zmm0 \n\t" \ + "vpdpbusd "#freg1", %%zmm30, %%zmm1 \n\t" \ + "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x8(%[stepC16]), "#input" \n\t" \ + "movq 0x10(%[stepC16]), %%r10 \n\t" \ + "vpdpbusd "#freg2", %%zmm30, %%zmm2 \n\t" \ + "vpdpbusd "#freg0", %%zmm31, %%zmm3 \n\t" \ + "vpdpbusd "#freg1", %%zmm31, %%zmm4 \n\t" \ + "vpdpbusd "#freg2", %%zmm31, %%zmm5 \n\t" \ + "vmovups "#off1"(%[filter]), "#preg1" \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm31 \n\t" \ + "vpdpbusd "#freg0", %%zmm30, %%zmm6 \n\t" \ + "vpdpbusd "#freg1", %%zmm30, %%zmm7 \n\t" \ + "vpdpbusd "#freg2", %%zmm30, %%zmm8 \n\t" \ + "vpdpbusd "#freg0", %%zmm31, %%zmm9 \n\t" \ + "vmovups "#off2"(%[filter]), "#preg2" \n\t" \ + "vpdpbusd "#freg1", %%zmm31, %%zmm10 \n\t" \ + "vpdpbusd "#freg2", %%zmm31, %%zmm11 \n\t" + +#define convKernel1x48c4_1(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2) \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ + "vmovups "#off1"(%[filter]), "#preg1" \n\t" \ + "vmovups "#off2"(%[filter]), "#preg2" \n\t" \ + "vpdpbusd "#freg0", %%zmm30, %%zmm0 \n\t" \ + "vpdpbusd "#freg1", %%zmm30, %%zmm1 \n\t" \ + "vpdpbusd "#freg2", %%zmm30, %%zmm2 \n\t" + +#define convKernel8x48c4_0(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2) \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm31 \n\t" \ + "vpdpbusd "#freg0", %%zmm30, %%zmm0 \n\t" \ + "vpdpbusd "#freg1", %%zmm30, %%zmm1 \n\t" \ + "vpdpbusd "#freg2", %%zmm30, %%zmm2 \n\t" \ + "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ + "vpdpbusd "#freg0", %%zmm31, %%zmm3 \n\t" \ + "vpdpbusd "#freg1", %%zmm31, %%zmm4 \n\t" \ + "vpdpbusd "#freg2", %%zmm31, %%zmm5 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm31 \n\t" \ + "vpdpbusd "#freg0", %%zmm30, %%zmm6 \n\t" \ + "vpdpbusd "#freg1", %%zmm30, %%zmm7 \n\t" \ + "vpdpbusd "#freg2", %%zmm30, %%zmm8 \n\t" \ + "vmovups "#off1"(%[filter]), "#preg1" \n\t" \ + "vpdpbusd "#freg0", %%zmm31, %%zmm9 \n\t" \ + "vpdpbusd "#freg1", %%zmm31, %%zmm10 \n\t" \ + "vpdpbusd "#freg2", %%zmm31, %%zmm11 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm31 \n\t" \ + "vpdpbusd "#freg0", %%zmm30, %%zmm12 \n\t" \ + "vpdpbusd "#freg1", %%zmm30, %%zmm13 \n\t" \ + "vpdpbusd "#freg2", %%zmm30, %%zmm14 \n\t" \ + "vmovups "#off2"(%[filter]), "#preg2" \n\t" \ + "vpdpbusd "#freg0", %%zmm31, %%zmm15 \n\t" \ + "vpdpbusd "#freg1", %%zmm31, %%zmm16 \n\t" \ + "vpdpbusd "#freg2", %%zmm31, %%zmm17 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm31 \n\t" \ + "vpdpbusd "#freg0", %%zmm30, %%zmm18 \n\t" \ + "vpdpbusd "#freg1", %%zmm30, %%zmm19 \n\t" \ + "vpdpbusd "#freg2", %%zmm30, %%zmm20 \n\t" \ + "vpdpbusd "#freg0", %%zmm31, %%zmm21 \n\t" \ + "vpdpbusd "#freg1", %%zmm31, %%zmm22 \n\t" \ + "vpdpbusd 
"#freg2", %%zmm31, %%zmm23 \n\t" + +#define convKernel4x48c4_0(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2) \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm31 \n\t" \ + "vpdpbusd "#freg0", %%zmm30, %%zmm0 \n\t" \ + "vpdpbusd "#freg1", %%zmm30, %%zmm1 \n\t" \ + "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ + "vpdpbusd "#freg2", %%zmm30, %%zmm2 \n\t" \ + "vpdpbusd "#freg0", %%zmm31, %%zmm3 \n\t" \ + "vpdpbusd "#freg1", %%zmm31, %%zmm4 \n\t" \ + "vpdpbusd "#freg2", %%zmm31, %%zmm5 \n\t" \ + "vmovups "#off1"(%[filter]), "#preg1" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm31 \n\t" \ + "vpdpbusd "#freg0", %%zmm30, %%zmm6 \n\t" \ + "vpdpbusd "#freg1", %%zmm30, %%zmm7 \n\t" \ + "vpdpbusd "#freg2", %%zmm30, %%zmm8 \n\t" \ + "vpdpbusd "#freg0", %%zmm31, %%zmm9 \n\t" \ + "vmovups "#off2"(%[filter]), "#preg2" \n\t" \ + "vpdpbusd "#freg1", %%zmm31, %%zmm10 \n\t" \ + "vpdpbusd "#freg2", %%zmm31, %%zmm11 \n\t" + +#define convKernel1x48c4_0(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2) \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ + "vmovups "#off1"(%[filter]), "#preg1" \n\t" \ + "vmovups "#off2"(%[filter]), "#preg2" \n\t" \ + "vpdpbusd "#freg0", %%zmm30, %%zmm0 \n\t" \ + "vpdpbusd "#freg1", %%zmm30, %%zmm1 \n\t" \ + "vpdpbusd "#freg2", %%zmm30, %%zmm2 \n\t" + #else + #define convKernel8x48c4_3(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2) \ - "vpbroadcastd ("#input"), %%zmm30 \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd ("#input"), %%zmm30 \n\t" \ - "vpaddd %%zmm0, "#preg0", %%zmm0 \n\t" \ - "vpaddd %%zmm1, "#preg1", %%zmm1 \n\t" \ - "vpaddd %%zmm2, "#preg2", %%zmm2 \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd ("#input"), %%zmm30 \n\t" \ - "vpaddd %%zmm3, "#preg0", %%zmm3 \n\t" \ - "vpaddd %%zmm4, "#preg1", %%zmm4 \n\t" \ - "vpaddd %%zmm5, "#preg2", %%zmm5 \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd ("#input"), %%zmm30 \n\t" \ - "vpaddd %%zmm6, "#preg0", %%zmm6 \n\t" \ - "vpaddd %%zmm7, "#preg1", %%zmm7 \n\t" \ - "vpaddd %%zmm8, "#preg2", %%zmm8 \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd ("#input"), %%zmm30 \n\t" \ - "vpaddd 
%%zmm9, "#preg0", %%zmm9 \n\t" \ - "vpaddd %%zmm10, "#preg1", %%zmm10 \n\t" \ - "vpaddd %%zmm11, "#preg2", %%zmm11 \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd ("#input"), %%zmm30 \n\t" \ - "vpaddd %%zmm12, "#preg0", %%zmm12 \n\t" \ - "vpaddd %%zmm13, "#preg1", %%zmm13 \n\t" \ - "vpaddd %%zmm14, "#preg2", %%zmm14 \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd ("#input"), %%zmm30 \n\t" \ - "vpaddd %%zmm15, "#preg0", %%zmm15 \n\t" \ - "vpaddd %%zmm16, "#preg1", %%zmm16 \n\t" \ - "vpaddd %%zmm17, "#preg2", %%zmm17 \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd ("#input"), %%zmm30 \n\t" \ - "vpaddd %%zmm18, "#preg0", %%zmm18 \n\t" \ - "vpaddd %%zmm19, "#preg1", %%zmm19 \n\t" \ - "vpaddd %%zmm20, "#preg2", %%zmm20 \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vmovups "#off1"(%[filter]), "#freg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vmovups "#off2"(%[filter]), "#freg2" \n\t" \ - "vpaddd %%zmm21, "#preg0", %%zmm21 \n\t" \ - "vpaddd %%zmm22, "#preg1", %%zmm22 \n\t" \ - "vpaddd %%zmm23, "#preg2", %%zmm23 \n\t" + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm0, "#preg0", %%zmm0 \n\t" \ + "vpaddd %%zmm1, "#preg1", %%zmm1 \n\t" \ + "vpaddd %%zmm2, "#preg2", %%zmm2 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm3, "#preg0", %%zmm3 \n\t" \ + "vpaddd %%zmm4, "#preg1", %%zmm4 \n\t" \ + "vpaddd %%zmm5, "#preg2", %%zmm5 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + 
"vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm6, "#preg0", %%zmm6 \n\t" \ + "vpaddd %%zmm7, "#preg1", %%zmm7 \n\t" \ + "vpaddd %%zmm8, "#preg2", %%zmm8 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm9, "#preg0", %%zmm9 \n\t" \ + "vpaddd %%zmm10, "#preg1", %%zmm10 \n\t" \ + "vpaddd %%zmm11, "#preg2", %%zmm11 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm12, "#preg0", %%zmm12 \n\t" \ + "vpaddd %%zmm13, "#preg1", %%zmm13 \n\t" \ + "vpaddd %%zmm14, "#preg2", %%zmm14 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm15, "#preg0", %%zmm15 \n\t" \ + "vpaddd %%zmm16, "#preg1", %%zmm16 \n\t" \ + "vpaddd %%zmm17, "#preg2", %%zmm17 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm18, "#preg0", %%zmm18 \n\t" \ + "vpaddd %%zmm19, "#preg1", %%zmm19 \n\t" \ + "vpaddd %%zmm20, "#preg2", %%zmm20 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vmovups "#off1"(%[filter]), "#freg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vmovups "#off2"(%[filter]), "#freg2" \n\t" \ + "vpaddd %%zmm21, "#preg0", %%zmm21 \n\t" \ + "vpaddd %%zmm22, "#preg1", %%zmm22 \n\t" \ + "vpaddd %%zmm23, "#preg2", %%zmm23 \n\t" #define convKernel4x48c4_3(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2) \ - "vpbroadcastd ("#input"), %%zmm30 \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd ("#input"), %%zmm30 \n\t" \ - "vpaddd %%zmm0, "#preg0", %%zmm0 \n\t" \ - "vpaddd %%zmm1, "#preg1", %%zmm1 \n\t" \ - "vpaddd %%zmm2, "#preg2", %%zmm2 \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ - 
"vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd ("#input"), %%zmm30 \n\t" \ - "vpaddd %%zmm3, "#preg0", %%zmm3 \n\t" \ - "vpaddd %%zmm4, "#preg1", %%zmm4 \n\t" \ - "vpaddd %%zmm5, "#preg2", %%zmm5 \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd ("#input"), %%zmm30 \n\t" \ - "vpaddd %%zmm6, "#preg0", %%zmm6 \n\t" \ - "vpaddd %%zmm7, "#preg1", %%zmm7 \n\t" \ - "vpaddd %%zmm8, "#preg2", %%zmm8 \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vmovups "#off1"(%[filter]), "#freg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vmovups "#off2"(%[filter]), "#freg2" \n\t" \ - "vpaddd %%zmm9, "#preg0", %%zmm9 \n\t" \ - "vpaddd %%zmm10, "#preg1", %%zmm10 \n\t" \ - "vpaddd %%zmm11, "#preg2", %%zmm11 \n\t" + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm0, "#preg0", %%zmm0 \n\t" \ + "vpaddd %%zmm1, "#preg1", %%zmm1 \n\t" \ + "vpaddd %%zmm2, "#preg2", %%zmm2 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm3, "#preg0", %%zmm3 \n\t" \ + "vpaddd %%zmm4, "#preg1", %%zmm4 \n\t" \ + "vpaddd %%zmm5, "#preg2", %%zmm5 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm6, "#preg0", %%zmm6 \n\t" \ + "vpaddd %%zmm7, "#preg1", %%zmm7 \n\t" \ + "vpaddd %%zmm8, "#preg2", %%zmm8 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vmovups "#off1"(%[filter]), "#freg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vmovups "#off2"(%[filter]), "#freg2" \n\t" \ + "vpaddd %%zmm9, "#preg0", %%zmm9 \n\t" \ + "vpaddd %%zmm10, "#preg1", %%zmm10 \n\t" \ + "vpaddd 
%%zmm11, "#preg2", %%zmm11 \n\t" #define convKernel1x48c4_3(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2) \ - "vpbroadcastd ("#input"), %%zmm30 \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vmovups "#off1"(%[filter]), "#freg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vmovups "#off2"(%[filter]), "#freg2" \n\t" \ - "vpaddd %%zmm0, "#preg0", %%zmm0 \n\t" \ - "vpaddd %%zmm1, "#preg1", %%zmm1 \n\t" \ - "vpaddd %%zmm2, "#preg2", %%zmm2 \n\t" - -#define convKernel8x48c4(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2) \ - convKernel8x48c4_3(input, %%zmm24, %%zmm25, %%zmm26, off0, off1, off2, %%zmm27, %%zmm28, %%zmm29) - -#define convKernel4x48c4(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2) \ - convKernel4x48c4_3(input, %%zmm24, %%zmm25, %%zmm26, off0, off1, off2, %%zmm27, %%zmm28, %%zmm29) + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vmovups "#off1"(%[filter]), "#freg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vmovups "#off2"(%[filter]), "#freg2" \n\t" \ + "vpaddd %%zmm0, "#preg0", %%zmm0 \n\t" \ + "vpaddd %%zmm1, "#preg1", %%zmm1 \n\t" \ + "vpaddd %%zmm2, "#preg2", %%zmm2 \n\t" + +#define convKernel8x48c4_4(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2) \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "addq (%[stepC16]), "#input" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm0, "#preg0", %%zmm0 \n\t" \ + "vpaddd %%zmm1, "#preg1", %%zmm1 \n\t" \ + "vpaddd %%zmm2, "#preg2", %%zmm2 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "addq 0x8(%[stepC16]), "#input" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm3, "#preg0", %%zmm3 \n\t" \ + "vpaddd %%zmm4, "#preg1", %%zmm4 \n\t" \ + "vpaddd %%zmm5, "#preg2", %%zmm5 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "addq 0x10(%[stepC16]), "#input" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm6, "#preg0", %%zmm6 \n\t" \ + "vpaddd %%zmm7, "#preg1", %%zmm7 \n\t" \ + "vpaddd %%zmm8, "#preg2", %%zmm8 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" 
\n\t" \ + "addq 0x18(%[stepC16]), "#input" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm9, "#preg0", %%zmm9 \n\t" \ + "vpaddd %%zmm10, "#preg1", %%zmm10 \n\t" \ + "vpaddd %%zmm11, "#preg2", %%zmm11 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "addq 0x20(%[stepC16]), "#input" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm12, "#preg0", %%zmm12 \n\t" \ + "vpaddd %%zmm13, "#preg1", %%zmm13 \n\t" \ + "vpaddd %%zmm14, "#preg2", %%zmm14 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "addq 0x28(%[stepC16]), "#input" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm15, "#preg0", %%zmm15 \n\t" \ + "vpaddd %%zmm16, "#preg1", %%zmm16 \n\t" \ + "vpaddd %%zmm17, "#preg2", %%zmm17 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "addq 0x30(%[stepC16]), "#input" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm18, "#preg0", %%zmm18 \n\t" \ + "vpaddd %%zmm19, "#preg1", %%zmm19 \n\t" \ + "vpaddd %%zmm20, "#preg2", %%zmm20 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vmovups "#off1"(%[filter]), "#freg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vmovups "#off2"(%[filter]), "#freg2" \n\t" \ + "vpaddd %%zmm21, "#preg0", %%zmm21 \n\t" \ + "vpaddd %%zmm22, "#preg1", %%zmm22 \n\t" \ + "vpaddd %%zmm23, "#preg2", %%zmm23 \n\t" + +#define convKernel4x48c4_4(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2) \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "addq (%[stepC16]), "#input" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm0, "#preg0", %%zmm0 \n\t" \ + "vpaddd %%zmm1, "#preg1", %%zmm1 \n\t" \ + "vpaddd %%zmm2, "#preg2", %%zmm2 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "addq 0x8(%[stepC16]), "#input" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm3, "#preg0", %%zmm3 
\n\t" \ + "vpaddd %%zmm4, "#preg1", %%zmm4 \n\t" \ + "vpaddd %%zmm5, "#preg2", %%zmm5 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "addq 0x10(%[stepC16]), "#input" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm6, "#preg0", %%zmm6 \n\t" \ + "vpaddd %%zmm7, "#preg1", %%zmm7 \n\t" \ + "vpaddd %%zmm8, "#preg2", %%zmm8 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vmovups "#off1"(%[filter]), "#freg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vmovups "#off2"(%[filter]), "#freg2" \n\t" \ + "vpaddd %%zmm9, "#preg0", %%zmm9 \n\t" \ + "vpaddd %%zmm10, "#preg1", %%zmm10 \n\t" \ + "vpaddd %%zmm11, "#preg2", %%zmm11 \n\t" + +#define convKernel1x48c4_4(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2) \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vmovups "#off1"(%[filter]), "#freg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vmovups "#off2"(%[filter]), "#freg2" \n\t" \ + "vpaddd %%zmm0, "#preg0", %%zmm0 \n\t" \ + "vpaddd %%zmm1, "#preg1", %%zmm1 \n\t" \ + "vpaddd %%zmm2, "#preg2", %%zmm2 \n\t" + +#define convKernel8x48c4_0(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2) \ + convKernel8x48c4_3(input, %%zmm24, %%zmm25, %%zmm26, \ + off0, off1, off2, %%zmm27, %%zmm28, %%zmm29) + +#define convKernel4x48c4_0(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2) \ + convKernel4x48c4_3(input, %%zmm24, %%zmm25, %%zmm26, \ + off0, off1, off2, %%zmm27, %%zmm28, %%zmm29) + +#define convKernel1x48c4_0(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2) \ + convKernel1x48c4_3(input, %%zmm24, %%zmm25, %%zmm26, \ + off0, off1, off2, %%zmm27, %%zmm28, %%zmm29) + +#define convKernel8x48c4_1(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2) \ + convKernel8x48c4_4(input, %%zmm24, %%zmm25, %%zmm26, \ + off0, off1, off2, %%zmm27, %%zmm28, %%zmm29) + +#define convKernel4x48c4_1(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2) \ + convKernel4x48c4_4(input, %%zmm24, %%zmm25, %%zmm26, \ + off0, off1, off2, %%zmm27, %%zmm28, %%zmm29) + +#define convKernel1x48c4_1(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2) \ + convKernel1x48c4_4(input, %%zmm24, %%zmm25, %%zmm26, \ + off0, off1, off2, %%zmm27, %%zmm28, %%zmm29) -#define convKernel1x48c4(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2) \ - convKernel1x48c4_3(input, %%zmm24, %%zmm25, %%zmm26, off0, off1, off2, %%zmm27, %%zmm28, %%zmm29) #endif -#define convKernelForLoopXx48(rnum, wsize) \ - __asm__ __volatile__("vmovups (%[filter]), %%zmm24 \n\t" \ - "vmovups 0x40(%[filter]), %%zmm25 \n\t" \ - "vmovups 0x80(%[filter]), %%zmm26 \n\t" \ - "addq $0xC0, %[filter] \n\t" \ - "mov $1, %%eax \n\t" \ 
- "vmovd %%eax, %%xmm0 \n\t" \ - "vpbroadcastw %%xmm0, %%zmm31 \n\t" \ - "movq %[flags], %%rax \n\t" \ - "andq $0x1, %%rax \n\t" \ - "jne 0f \n\t" \ - load48BiasTo##rnum##Regs(%[bias]) \ - "cmpq $0x10, %%rcx \n\t" \ - "jl 4f \n\t" \ - "jmp 1f \n\t" \ - ".align 16 \n\t" \ - "0: \n\t" \ - clear##rnum##Regs(%%zmm) \ - "cmpq $0x10, %%rcx \n\t" \ - "jl 4f \n\t" \ - ".align 16 \n\t" \ - "1: \n\t" \ - "mov %[kh], %%rbx \n\t" \ - ".align 16 \n\t" \ - "2: \n\t" \ - "mov %[kw], %%r9 \n\t" \ - ".align 16 \n\t" \ - "3: \n\t" \ - "movq %[input], %%rax \n\t" \ - convKernel##wsize##x48c4(%%rax, %%zmm24, %%zmm25, %%zmm26, 0x0, 0x40, 0x80, %%zmm27, %%zmm28, %%zmm29) \ - "movq %[input], %%rax \n\t" \ - "addq $0x4, %%rax \n\t" \ - convKernel##wsize##x48c4(%%rax, %%zmm27, %%zmm28, %%zmm29, 0xC0, 0x100, 0x140, %%zmm24, %%zmm25, %%zmm26) \ - "movq %[input], %%rax \n\t" \ - "addq $0x8, %%rax \n\t" \ - convKernel##wsize##x48c4(%%rax, %%zmm24, %%zmm25, %%zmm26, 0x180, 0x1C0, 0x200, %%zmm27, %%zmm28, %%zmm29) \ - "movq %[input], %%rax \n\t" \ - "addq $0xC, %%rax \n\t" \ - convKernel##wsize##x48c4(%%rax, %%zmm27, %%zmm28, %%zmm29, 0x240, 0x280, 0x2C0, %%zmm24, %%zmm25, %%zmm26) \ - "addq $0x300, %[filter] \n\t" \ - "addq %[dilateW], %[input] \n\t" \ - "dec %%r9 \n\t" \ - "jg 3b \n\t" \ - "addq %[dilateH], %[input] \n\t" \ - "dec %%rbx \n\t" \ - "jg 2b \n\t" \ - "addq %[fStep], %[input] \n\t" \ - "subq $0x10, %%rcx \n\t" \ - "cmpq $0x10, %%rcx \n\t" \ - "jge 1b \n\t" \ - "subq %[fStep], %[input] \n\t" \ - "addq %[f8Step], %[input] \n\t" \ - ".align 16 \n\t" \ - "4: \n\t" \ - : "+c" (c.ic), [input] "+r" (c.input), [filter] "+r" (c.filter) \ - : [bias] "r" (c.bias), [kh] "r" (c.kh), [kw] "r" (c.kw), \ - [stepC16] "r" (c.stepC16), [dilateW] "r" (c.dilateW), \ - [dilateH] "r" (c.dilateH), [fStep] "r" (c.fStep), [flags] "r" (c.flags), \ - [f8Step] "r" (c.f8Step) \ - : "%rax", "%rbx", "%r9", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", \ - "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", \ - "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", \ - "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", \ - "%zmm31", "memory", "cc"); \ - if (c.ic > 0) { \ - __asm__ __volatile__("cmpq $0x8, %%rcx \n\t" \ - "jl 2f \n\t" \ - "subq $0x8, %%rcx \n\t" \ - "shr $1, %[dilateW] \n\t" \ - "shr $1, %[dilateH] \n\t" \ - "shr $1, %[fStep] \n\t" \ - "shr $1, %[stepC16] \n\t" \ - "mov %[kh], %%rbx \n\t" \ - ".align 16 \n\t" \ - "0: \n\t" \ - "mov %[kw], %%r9 \n\t" \ - ".align 16 \n\t" \ - "1: \n\t" \ - "movq %[input], %%rax \n\t" \ - convKernel##wsize##x48c4(%%rax, %%zmm24, %%zmm25, %%zmm26, 0x0, 0x40, 0x80, %%zmm27, %%zmm28, %%zmm29) \ - "movq %[input], %%rax \n\t" \ - "addq $0x4, %%rax \n\t" \ - convKernel##wsize##x48c4(%%rax, %%zmm27, %%zmm28, %%zmm29, 0xC0, 0x100, 0x140, %%zmm24, %%zmm25, %%zmm26) \ - "addq $0x180, %[filter] \n\t" \ - "addq %[dilateW], %[input] \n\t" \ - "dec %%r9 \n\t" \ - "jg 1b \n\t" \ - "addq %[dilateH], %[input] \n\t" \ - "dec %%rbx \n\t" \ - "jg 0b \n\t" \ - "addq %[f4Step], %[input] \n\t" \ - ".align 16 \n\t" \ - "2: \n\t" \ - "cmpq $0x4, %%rcx \n\t" \ - "jl 5f \n\t" \ - "shr $1, %[dilateW] \n\t" \ - "shr $1, %[dilateH] \n\t" \ - "shr $1, %[stepC16] \n\t" \ - "mov %[kh], %%rbx \n\t" \ - ".align 16 \n\t" \ - "3: \n\t" \ - "mov %[kw], %%r9 \n\t" \ - ".align 16 \n\t" \ - "4: \n\t" \ - "movq %[input], %%rax \n\t" \ - convKernel##wsize##x48c4(%%rax, %%zmm24, %%zmm25, %%zmm26, 0x0, 0x40, 0x80, %%zmm27, %%zmm28, %%zmm29) \ 
- "addq $0xC0, %[filter] \n\t" \ - "addq %[dilateW], %[input] \n\t" \ - "dec %%r9 \n\t" \ - "jg 4b \n\t" \ - "addq %[dilateH], %[input] \n\t" \ - "dec %%rbx \n\t" \ - "jg 3b \n\t" \ - ".align 16 \n\t" \ - "5: \n\t" \ - : "+c" (c.ic) \ - : [input] "r" (c.input), [filter] "r" (c.filter), [bias] "r" (c.bias), [kh] "r" (c.kh), [kw] "r" (c.kw), \ - [stepC16] "r" (c.stepC16), [dilateW] "r" (c.dilateW), \ - [dilateH] "r" (c.dilateH), [fStep] "r" (c.fStep), \ - [f4Step] "r" (c.f4Step) \ - : "%rax", "%rbx", "%r9", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", \ - "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", \ - "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", \ - "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", \ - "%zmm31", "memory", "cc"); \ +#define convKernelForLoopXx48(rnum, wsize, cross) \ + __asm__ __volatile__("vmovups (%[filter]), %%zmm24 \n\t" \ + "vmovups 0x40(%[filter]), %%zmm25 \n\t" \ + "vmovups 0x80(%[filter]), %%zmm26 \n\t" \ + "addq $0xC0, %[filter] \n\t" \ + "mov $1, %%eax \n\t" \ + "vmovd %%eax, %%xmm0 \n\t" \ + "vpbroadcastw %%xmm0, %%zmm31 \n\t" \ + "movq %[flags], %%rax \n\t" \ + "andq $0x1, %%rax \n\t" \ + "jne 0f \n\t" \ + load48BiasTo##rnum##Regs(%[bias]) \ + "jmp 1f \n\t" \ + ".align 16 \n\t" \ + "0: \n\t" \ + clear##rnum##Regs(%%zmm) \ + ".align 16 \n\t" \ + "1: \n\t" \ + : [filter] "+r" (c.filter) \ + : [bias] "r" (c.bias), \ + [flags] "r" (c.flags) \ + : "%rax", \ + "%zmm0", "%zmm1","%zmm2", "%zmm3", "%zmm4", "%zmm5", \ + "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", \ + "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", \ + "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", \ + "%zmm24", "%zmm25", "%zmm26", "memory", "cc"); \ + if (c.ic >= 16) { \ + __asm__ __volatile__("movq (%[stepC16]), %%r10 \n\t" \ + ".align 16 \n\t" \ + "1: \n\t" \ + "mov %[kh], %%rbx \n\t" \ + ".align 16 \n\t" \ + "2: \n\t" \ + "mov %[kw], %%r9 \n\t" \ + ".align 16 \n\t" \ + "3: \n\t" \ + "movq %[input], %%rax \n\t" \ + convKernel##wsize##x48c4_##cross(%%rax, %%zmm24, %%zmm25, %%zmm26, \ + 0x0, 0x40, 0x80, %%zmm27, %%zmm28, %%zmm29) \ + "movq %[input], %%rax \n\t" \ + "addq $0x4, %%rax \n\t" \ + convKernel##wsize##x48c4_##cross(%%rax, %%zmm27, %%zmm28, %%zmm29, \ + 0xC0, 0x100, 0x140, %%zmm24, %%zmm25, %%zmm26) \ + "movq %[input], %%rax \n\t" \ + "addq $0x8, %%rax \n\t" \ + convKernel##wsize##x48c4_##cross(%%rax, %%zmm24, %%zmm25, %%zmm26, \ + 0x180, 0x1C0, 0x200, %%zmm27, %%zmm28, %%zmm29) \ + "movq %[input], %%rax \n\t" \ + "addq $0xC, %%rax \n\t" \ + convKernel##wsize##x48c4_##cross(%%rax, %%zmm27, %%zmm28, %%zmm29, \ + 0x240, 0x280, 0x2C0, %%zmm24, %%zmm25, %%zmm26) \ + "addq $0x300, %[filter] \n\t" \ + "addq %[dilateW], %[input] \n\t" \ + "dec %%r9 \n\t" \ + "jg 3b \n\t" \ + "addq %[dilateH], %[input] \n\t" \ + "dec %%rbx \n\t" \ + "jg 2b \n\t" \ + "addq %[fStep], %[input] \n\t" \ + "subq $0x10, %%rcx \n\t" \ + "cmpq $0x10, %%rcx \n\t" \ + "jge 1b \n\t" \ + "subq %[fStep], %[input] \n\t" \ + "addq %[f8Step], %[input] \n\t" \ + ".align 16 \n\t" \ + "4: \n\t" \ + : "+c" (c.ic), \ + [input] "+r" (c.input), \ + [filter] "+r" (c.filter) \ + : [kh] "r" (c.kh), \ + [kw] "r" (c.kw), \ + [stepC16] "r" (c.stepC16), \ + [dilateW] "r" (c.dilateW), \ + [dilateH] "r" (c.dilateH), \ + [fStep] "r" (c.fStep), \ + [f8Step] "r" (c.f8Step) \ + : "%rax", "%rbx", "%r9", "%r10", \ + "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", \ + "%zmm6", "%zmm7", "%zmm8", "%zmm9", 
"%zmm10", "%zmm11", \ + "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", \ + "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", \ + "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", \ + "%zmm30", "%zmm31", "memory", "cc"); \ + } \ + if (c.ic > 0) { \ + __asm__ __volatile__("cmpq $0x8, %%rcx \n\t" \ + "jl 2f \n\t" \ + "subq $0x8, %%rcx \n\t" \ + "shr $1, %[dilateW] \n\t" \ + "shr $1, %[dilateH] \n\t" \ + "shr $1, %[fStep] \n\t" \ + "addq $192, %[stepC16] \n\t" \ + "mov %[kh], %%rbx \n\t" \ + ".align 16 \n\t" \ + "0: \n\t" \ + "mov %[kw], %%r9 \n\t" \ + ".align 16 \n\t" \ + "1: \n\t" \ + "movq %[input], %%rax \n\t" \ + convKernel##wsize##x48c4_##cross(%%rax, %%zmm24, %%zmm25, %%zmm26, \ + 0x0, 0x40, 0x80, %%zmm27, %%zmm28, %%zmm29) \ + "movq %[input], %%rax \n\t" \ + "addq $0x4, %%rax \n\t" \ + convKernel##wsize##x48c4_##cross(%%rax, %%zmm27, %%zmm28, %%zmm29, \ + 0xC0, 0x100, 0x140, %%zmm24, %%zmm25, %%zmm26) \ + "addq $0x180, %[filter] \n\t" \ + "addq %[dilateW], %[input] \n\t" \ + "dec %%r9 \n\t" \ + "jg 1b \n\t" \ + "addq %[dilateH], %[input] \n\t" \ + "dec %%rbx \n\t" \ + "jg 0b \n\t" \ + "addq %[f4Step], %[input] \n\t" \ + ".align 16 \n\t" \ + "2: \n\t" \ + "cmpq $0x4, %%rcx \n\t" \ + "jl 5f \n\t" \ + "shr $1, %[dilateW] \n\t" \ + "shr $1, %[dilateH] \n\t" \ + "addq $192, %[stepC16] \n\t" \ + "mov %[kh], %%rbx \n\t" \ + ".align 16 \n\t" \ + "3: \n\t" \ + "mov %[kw], %%r9 \n\t" \ + ".align 16 \n\t" \ + "4: \n\t" \ + "movq %[input], %%rax \n\t" \ + convKernel##wsize##x48c4_##cross(%%rax, %%zmm24, %%zmm25, %%zmm26, \ + 0x0, 0x40, 0x80, %%zmm27, %%zmm28, %%zmm29) \ + "addq $0xC0, %[filter] \n\t" \ + "addq %[dilateW], %[input] \n\t" \ + "dec %%r9 \n\t" \ + "jg 4b \n\t" \ + "addq %[dilateH], %[input] \n\t" \ + "dec %%rbx \n\t" \ + "jg 3b \n\t" \ + ".align 16 \n\t" \ + "5: \n\t" \ + : "+c" (c.ic) \ + : [input] "r" (c.input), \ + [filter] "r" (c.filter), \ + [kh] "r" (c.kh), \ + [kw] "r" (c.kw), \ + [stepC16] "r" (c.stepC16), \ + [dilateW] "r" (c.dilateW), \ + [dilateH] "r" (c.dilateH), \ + [fStep] "r" (c.fStep), \ + [f4Step] "r" (c.f4Step) \ + : "%rax", "%rbx", "%r9", \ + "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", \ + "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", \ + "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", \ + "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", \ + "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", \ + "%zmm30", "%zmm31", "memory", "cc"); \ } void Avx512ConvKernel8x48(ConvController &c) { - convKernelForLoopXx48(24, 8) - - __asm__ __volatile__("movq %[output], %%rax \n\t" - "movq %[ostepC16], %%rbx \n\t" - "movq %[flags], %%rcx \n\t" - "and $0x1, %%rcx \n\t" - "je 0f \n\t" - "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" - "vpaddd 0x40(%%rax), %%zmm3, %%zmm3 \n\t" - "vpaddd 0x80(%%rax), %%zmm6, %%zmm6 \n\t" - "vpaddd 0xC0(%%rax), %%zmm9, %%zmm9 \n\t" - "vpaddd 0x100(%%rax), %%zmm12, %%zmm12 \n\t" - "vpaddd 0x140(%%rax), %%zmm15, %%zmm15 \n\t" - "vpaddd 0x180(%%rax), %%zmm18, %%zmm18 \n\t" - "vpaddd 0x1C0(%%rax), %%zmm21, %%zmm21 \n\t" - "vpaddd (%%rax, %%rbx), %%zmm1, %%zmm1 \n\t" - "vpaddd 0x40(%%rax, %%rbx), %%zmm4, %%zmm4 \n\t" - "vpaddd 0x80(%%rax, %%rbx), %%zmm7, %%zmm7 \n\t" - "vpaddd 0xC0(%%rax, %%rbx), %%zmm10, %%zmm10 \n\t" - "vpaddd 0x100(%%rax, %%rbx), %%zmm13, %%zmm13 \n\t" - "vpaddd 0x140(%%rax, %%rbx), %%zmm16, %%zmm16 \n\t" - "vpaddd 0x180(%%rax, %%rbx), %%zmm19, %%zmm19 \n\t" - "vpaddd 0x1C0(%%rax, %%rbx), %%zmm22, %%zmm22 \n\t" - "vpaddd (%%rax, %%rbx, 2), %%zmm2, %%zmm2 \n\t" - 
"vpaddd 0x40(%%rax, %%rbx, 2), %%zmm5, %%zmm5 \n\t" - "vpaddd 0x80(%%rax, %%rbx, 2), %%zmm8, %%zmm8 \n\t" - "vpaddd 0xC0(%%rax, %%rbx, 2), %%zmm11, %%zmm11 \n\t" - "vpaddd 0x100(%%rax, %%rbx, 2), %%zmm14, %%zmm14 \n\t" - "vpaddd 0x140(%%rax, %%rbx, 2), %%zmm17, %%zmm17 \n\t" - "vpaddd 0x180(%%rax, %%rbx, 2), %%zmm20, %%zmm20 \n\t" - "vpaddd 0x1C0(%%rax, %%rbx, 2), %%zmm23, %%zmm23 \n\t" + if (c.cross) { + convKernelForLoopXx48(24, 8, 1) + } else { + convKernelForLoopXx48(24, 8, 0) + } - ".align 16 \n\t" - "0: \n\t" - "movq %[flags], %%rcx \n\t" + __asm__ __volatile__("movq %[output], %%rax \n\t" + "movq %[ostepC16], %%rbx \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x1, %%rcx \n\t" + "je 0f \n\t" + "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" + "vpaddd 0x40(%%rax), %%zmm3, %%zmm3 \n\t" + "vpaddd 0x80(%%rax), %%zmm6, %%zmm6 \n\t" + "vpaddd 0xC0(%%rax), %%zmm9, %%zmm9 \n\t" + "vpaddd 0x100(%%rax), %%zmm12, %%zmm12 \n\t" + "vpaddd 0x140(%%rax), %%zmm15, %%zmm15 \n\t" + "vpaddd 0x180(%%rax), %%zmm18, %%zmm18 \n\t" + "vpaddd 0x1C0(%%rax), %%zmm21, %%zmm21 \n\t" + "vpaddd (%%rax, %%rbx), %%zmm1, %%zmm1 \n\t" + "vpaddd 0x40(%%rax, %%rbx), %%zmm4, %%zmm4 \n\t" + "vpaddd 0x80(%%rax, %%rbx), %%zmm7, %%zmm7 \n\t" + "vpaddd 0xC0(%%rax, %%rbx), %%zmm10, %%zmm10 \n\t" + "vpaddd 0x100(%%rax, %%rbx), %%zmm13, %%zmm13 \n\t" + "vpaddd 0x140(%%rax, %%rbx), %%zmm16, %%zmm16 \n\t" + "vpaddd 0x180(%%rax, %%rbx), %%zmm19, %%zmm19 \n\t" + "vpaddd 0x1C0(%%rax, %%rbx), %%zmm22, %%zmm22 \n\t" + "vpaddd (%%rax, %%rbx, 2), %%zmm2, %%zmm2 \n\t" + "vpaddd 0x40(%%rax, %%rbx, 2), %%zmm5, %%zmm5 \n\t" + "vpaddd 0x80(%%rax, %%rbx, 2), %%zmm8, %%zmm8 \n\t" + "vpaddd 0xC0(%%rax, %%rbx, 2), %%zmm11, %%zmm11 \n\t" + "vpaddd 0x100(%%rax, %%rbx, 2), %%zmm14, %%zmm14 \n\t" + "vpaddd 0x140(%%rax, %%rbx, 2), %%zmm17, %%zmm17 \n\t" + "vpaddd 0x180(%%rax, %%rbx, 2), %%zmm20, %%zmm20 \n\t" + "vpaddd 0x1C0(%%rax, %%rbx, 2), %%zmm23, %%zmm23 \n\t" + + ".align 16 \n\t" + "0: \n\t" + "cmpq $0x0, %[scale] \n\t" + "jne 1f \n\t" + "movq %[flags], %%rcx \n\t" "and $0xC, %%rcx \n\t" - "je 1f \n\t" + "je 4f \n\t" relu24Regs(%%zmm) + "jmp 4f \n\t" - ".align 16 \n\t" - "1: \n\t" - "cmpq $0x0, %[scale] \n\t" - "je 2f \n\t" + ".align 16 \n\t" + "1: \n\t" convert24RegsI32ToF32(%[scale], %%zmm) - ".align 16 \n\t" - "2: \n\t" - "vmovups %%zmm0, (%%rax) \n\t" - "vmovups %%zmm3, 0x40(%%rax) \n\t" - "vmovups %%zmm6, 0x80(%%rax) \n\t" - "vmovups %%zmm9, 0xC0(%%rax) \n\t" - "vmovups %%zmm12, 0x100(%%rax) \n\t" - "vmovups %%zmm15, 0x140(%%rax) \n\t" - "vmovups %%zmm18, 0x180(%%rax) \n\t" - "vmovups %%zmm21, 0x1C0(%%rax) \n\t" - "vmovups %%zmm1, (%%rax, %%rbx) \n\t" - "vmovups %%zmm4, 0x40(%%rax, %%rbx) \n\t" - "vmovups %%zmm7, 0x80(%%rax, %%rbx) \n\t" - "vmovups %%zmm10, 0xC0(%%rax, %%rbx) \n\t" - "vmovups %%zmm13, 0x100(%%rax, %%rbx) \n\t" - "vmovups %%zmm16, 0x140(%%rax, %%rbx) \n\t" - "vmovups %%zmm19, 0x180(%%rax, %%rbx) \n\t" - "vmovups %%zmm22, 0x1C0(%%rax, %%rbx) \n\t" - "vmovups %%zmm2, (%%rax, %%rbx, 2) \n\t" - "vmovups %%zmm5, 0x40(%%rax, %%rbx, 2) \n\t" - "vmovups %%zmm8, 0x80(%%rax, %%rbx, 2) \n\t" - "vmovups %%zmm11, 0xC0(%%rax, %%rbx, 2) \n\t" - "vmovups %%zmm14, 0x100(%%rax, %%rbx, 2) \n\t" - "vmovups %%zmm17, 0x140(%%rax, %%rbx, 2) \n\t" - "vmovups %%zmm20, 0x180(%%rax, %%rbx, 2) \n\t" - "vmovups %%zmm23, 0x1C0(%%rax, %%rbx, 2) \n\t" + ".align 16 \n\t" + "2: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x2, %%rcx \n\t" + "je 3f \n\t" + "vaddps (%[eltwise]), %%zmm0, %%zmm0 \n\t" + "vaddps 0x40(%[eltwise]), %%zmm3, %%zmm3 \n\t" + "vaddps 
0x80(%[eltwise]), %%zmm6, %%zmm6 \n\t" + "vaddps 0xC0(%[eltwise]), %%zmm9, %%zmm9 \n\t" + "vaddps 0x100(%[eltwise]), %%zmm12, %%zmm12 \n\t" + "vaddps 0x140(%[eltwise]), %%zmm15, %%zmm15 \n\t" + "vaddps 0x180(%[eltwise]), %%zmm18, %%zmm18 \n\t" + "vaddps 0x1C0(%[eltwise]), %%zmm21, %%zmm21 \n\t" + "vaddps (%[eltwise], %%rbx), %%zmm1, %%zmm1 \n\t" + "vaddps 0x40(%[eltwise], %%rbx), %%zmm4, %%zmm4 \n\t" + "vaddps 0x80(%[eltwise], %%rbx), %%zmm7, %%zmm7 \n\t" + "vaddps 0xC0(%[eltwise], %%rbx), %%zmm10, %%zmm10 \n\t" + "vaddps 0x100(%[eltwise], %%rbx), %%zmm13, %%zmm13 \n\t" + "vaddps 0x140(%[eltwise], %%rbx), %%zmm16, %%zmm16 \n\t" + "vaddps 0x180(%[eltwise], %%rbx), %%zmm19, %%zmm19 \n\t" + "vaddps 0x1C0(%[eltwise], %%rbx), %%zmm22, %%zmm22 \n\t" + "vaddps (%[eltwise], %%rbx, 2), %%zmm2, %%zmm2 \n\t" + "vaddps 0x40(%[eltwise], %%rbx, 2), %%zmm5, %%zmm5 \n\t" + "vaddps 0x80(%[eltwise], %%rbx, 2), %%zmm8, %%zmm8 \n\t" + "vaddps 0xC0(%[eltwise], %%rbx, 2), %%zmm11, %%zmm11 \n\t" + "vaddps 0x100(%[eltwise], %%rbx, 2), %%zmm14, %%zmm14 \n\t" + "vaddps 0x140(%[eltwise], %%rbx, 2), %%zmm17, %%zmm17 \n\t" + "vaddps 0x180(%[eltwise], %%rbx, 2), %%zmm20, %%zmm20 \n\t" + "vaddps 0x1C0(%[eltwise], %%rbx, 2), %%zmm23, %%zmm23 \n\t" + + ".align 16 \n\t" + "3: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" + relu24RegsPs(%%zmm) + + ".align 16 \n\t" + "4: \n\t" + "vmovups %%zmm0, (%%rax) \n\t" + "vmovups %%zmm3, 0x40(%%rax) \n\t" + "vmovups %%zmm6, 0x80(%%rax) \n\t" + "vmovups %%zmm9, 0xC0(%%rax) \n\t" + "vmovups %%zmm12, 0x100(%%rax) \n\t" + "vmovups %%zmm15, 0x140(%%rax) \n\t" + "vmovups %%zmm18, 0x180(%%rax) \n\t" + "vmovups %%zmm21, 0x1C0(%%rax) \n\t" + "vmovups %%zmm1, (%%rax, %%rbx) \n\t" + "vmovups %%zmm4, 0x40(%%rax, %%rbx) \n\t" + "vmovups %%zmm7, 0x80(%%rax, %%rbx) \n\t" + "vmovups %%zmm10, 0xC0(%%rax, %%rbx) \n\t" + "vmovups %%zmm13, 0x100(%%rax, %%rbx) \n\t" + "vmovups %%zmm16, 0x140(%%rax, %%rbx) \n\t" + "vmovups %%zmm19, 0x180(%%rax, %%rbx) \n\t" + "vmovups %%zmm22, 0x1C0(%%rax, %%rbx) \n\t" + "vmovups %%zmm2, (%%rax, %%rbx, 2) \n\t" + "vmovups %%zmm5, 0x40(%%rax, %%rbx, 2) \n\t" + "vmovups %%zmm8, 0x80(%%rax, %%rbx, 2) \n\t" + "vmovups %%zmm11, 0xC0(%%rax, %%rbx, 2) \n\t" + "vmovups %%zmm14, 0x100(%%rax, %%rbx, 2) \n\t" + "vmovups %%zmm17, 0x140(%%rax, %%rbx, 2) \n\t" + "vmovups %%zmm20, 0x180(%%rax, %%rbx, 2) \n\t" + "vmovups %%zmm23, 0x1C0(%%rax, %%rbx, 2) \n\t" : - : [output] "r" (c.output), [ostepC16] "r" (c.ostepC16), [flags] "r" (c.flags), [scale] "r" (c.scale) - : "%rax", "%rbx", "%rcx", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", - "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", - "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", - "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", - "%zmm31", "memory", "cc"); + : [output] "r" (c.output), + [eltwise] "r" (c.eltwise), + [ostepC16] "r" (c.ostepC16), + [flags] "r" (c.flags), + [scale] "r" (c.scale) + : "%rax", "%rbx", "%rcx", + "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", + "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", + "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", + "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", + "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", + "%zmm30", "%zmm31", "memory", "cc"); } -void Avx512ConvKernel4x48(ConvController &c) { - convKernelForLoopXx48(12, 4) - __asm__ __volatile__("movq %[output], %%rax \n\t" - "movq 
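The rewritten epilogue of `Avx512ConvKernel8x48` orders its post-ops as: optional accumulation of the previous int32 output (flag bit 0), then either an integer ReLU when no scale pointer is given, or an int32-to-fp32 conversion followed by an optional elementwise add from `c.eltwise` (flag bit 1) and a float ReLU (flag bits 2-3), before the final stores. A compact per-element sketch of that control flow, under the assumption that the convert macro also applies the per-channel scale; the function and parameter names are illustrative.

```cpp
#include <cstdint>

// Scalar sketch of the per-element post-processing order in the rewritten epilogue.
// flags bit 0: accumulate the previous int32 output; bit 1: elementwise add from
// c.eltwise (fp32 path only); bits 2-3: ReLU. A non-null scale selects the fp32 path.
static inline void postProcessElement(int32_t acc, int32_t prevOut, float eltwise,
                                      uint32_t flags, const float *scale,
                                      int32_t *dstI32, float *dstF32)
{
    if (flags & 0x1) {
        acc += prevOut;                 // vpaddd with the existing output tile
    }
    if (scale == nullptr) {             // integer output path (no eltwise add)
        if ((flags & 0xC) && acc < 0) {
            acc = 0;                    // relu24Regs
        }
        *dstI32 = acc;                  // vmovups of the int32 registers
        return;
    }
    float v = (float)acc * (*scale);    // convert24RegsI32ToF32 (scale assumed)
    if (flags & 0x2) {
        v += eltwise;                   // vaddps with c.eltwise
    }
    if ((flags & 0xC) && v < 0.0f) {
        v = 0.0f;                       // relu24RegsPs
    }
    *dstF32 = v;                        // vmovups of the fp32 registers
}
```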
%[ostepC16], %%rbx \n\t" - "movq %[flags], %%rcx \n\t" - "and $0x1, %%rcx \n\t" - "je 0f \n\t" - "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" - "vpaddd 0x40(%%rax), %%zmm3, %%zmm3 \n\t" - "vpaddd 0x80(%%rax), %%zmm6, %%zmm6 \n\t" - "vpaddd 0xC0(%%rax), %%zmm9, %%zmm9 \n\t" - "vpaddd (%%rax, %%rbx), %%zmm1, %%zmm1 \n\t" - "vpaddd 0x40(%%rax, %%rbx), %%zmm4, %%zmm4 \n\t" - "vpaddd 0x80(%%rax, %%rbx), %%zmm7, %%zmm7 \n\t" - "vpaddd 0xC0(%%rax, %%rbx), %%zmm10, %%zmm10 \n\t" - "vpaddd (%%rax, %%rbx, 2), %%zmm2, %%zmm2 \n\t" - "vpaddd 0x40(%%rax, %%rbx, 2), %%zmm5, %%zmm5 \n\t" - "vpaddd 0x80(%%rax, %%rbx, 2), %%zmm8, %%zmm8 \n\t" - "vpaddd 0xC0(%%rax, %%rbx, 2), %%zmm11, %%zmm11 \n\t" +void Avx512ConvKernel4x48(ConvController &c) { + if (c.cross) { + convKernelForLoopXx48(12, 4, 1) + } else { + convKernelForLoopXx48(12, 4, 0) + } - ".align 16 \n\t" - "0: \n\t" - "movq %[flags], %%rcx \n\t" - "and $0xC, %%rcx \n\t" - "je 1f \n\t" + __asm__ __volatile__("movq %[output], %%rax \n\t" + "movq %[ostepC16], %%rbx \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x1, %%rcx \n\t" + "je 0f \n\t" + "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" + "vpaddd 0x40(%%rax), %%zmm3, %%zmm3 \n\t" + "vpaddd 0x80(%%rax), %%zmm6, %%zmm6 \n\t" + "vpaddd 0xC0(%%rax), %%zmm9, %%zmm9 \n\t" + "vpaddd (%%rax, %%rbx), %%zmm1, %%zmm1 \n\t" + "vpaddd 0x40(%%rax, %%rbx), %%zmm4, %%zmm4 \n\t" + "vpaddd 0x80(%%rax, %%rbx), %%zmm7, %%zmm7 \n\t" + "vpaddd 0xC0(%%rax, %%rbx), %%zmm10, %%zmm10 \n\t" + "vpaddd (%%rax, %%rbx, 2), %%zmm2, %%zmm2 \n\t" + "vpaddd 0x40(%%rax, %%rbx, 2), %%zmm5, %%zmm5 \n\t" + "vpaddd 0x80(%%rax, %%rbx, 2), %%zmm8, %%zmm8 \n\t" + "vpaddd 0xC0(%%rax, %%rbx, 2), %%zmm11, %%zmm11 \n\t" + + ".align 16 \n\t" + "0: \n\t" + "cmpq $0x0, %[scale] \n\t" + "jne 1f \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" relu12Regs(%%zmm) + "jmp 4f \n\t" - ".align 16 \n\t" - "1: \n\t" - "cmpq $0x0, %[scale] \n\t" - "je 2f \n\t" + ".align 16 \n\t" + "1: \n\t" convert12RegsI32ToF32(%[scale], %%zmm) - ".align 16 \n\t" - "2: \n\t" + ".align 16 \n\t" + "2: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x2, %%rcx \n\t" + "je 3f \n\t" + "vaddps (%[eltwise]), %%zmm0, %%zmm0 \n\t" + "vaddps 0x40(%[eltwise]), %%zmm3, %%zmm3 \n\t" + "vaddps 0x80(%[eltwise]), %%zmm6, %%zmm6 \n\t" + "vaddps 0xC0(%[eltwise]), %%zmm9, %%zmm9 \n\t" + "vaddps (%[eltwise], %%rbx), %%zmm1, %%zmm1 \n\t" + "vaddps 0x40(%[eltwise], %%rbx), %%zmm4, %%zmm4 \n\t" + "vaddps 0x80(%[eltwise], %%rbx), %%zmm7, %%zmm7 \n\t" + "vaddps 0xC0(%[eltwise], %%rbx), %%zmm10, %%zmm10 \n\t" + "vaddps (%[eltwise], %%rbx, 2), %%zmm2, %%zmm2 \n\t" + "vaddps 0x40(%[eltwise], %%rbx, 2), %%zmm5, %%zmm5 \n\t" + "vaddps 0x80(%[eltwise], %%rbx, 2), %%zmm8, %%zmm8 \n\t" + "vaddps 0xC0(%[eltwise], %%rbx, 2), %%zmm11, %%zmm11 \n\t" + + ".align 16 \n\t" + "3: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" + relu12RegsPs(%%zmm) + + ".align 16 \n\t" + "4: \n\t" "vmovups %%zmm0, (%%rax) \n\t" "vmovups %%zmm3, 0x40(%%rax) \n\t" "vmovups %%zmm6, 0x80(%%rax) \n\t" "vmovups %%zmm9, 0xC0(%%rax) \n\t" - "vmovups %%zmm1, (%%rax, %%rbx) \n\t" - "vmovups %%zmm4, 0x40(%%rax, %%rbx) \n\t" - "vmovups %%zmm7, 0x80(%%rax, %%rbx) \n\t" - "vmovups %%zmm10, 0xC0(%%rax, %%rbx) \n\t" - "vmovups %%zmm2, (%%rax, %%rbx, 2) \n\t" - "vmovups %%zmm5, 0x40(%%rax, %%rbx, 2) \n\t" - "vmovups %%zmm8, 0x80(%%rax, %%rbx, 2) \n\t" - "vmovups %%zmm11, 0xC0(%%rax, %%rbx, 2) \n\t" + "vmovups %%zmm1, (%%rax, %%rbx) \n\t" + "vmovups %%zmm4, 0x40(%%rax, %%rbx) \n\t" + "vmovups %%zmm7, 0x80(%%rax, 
%%rbx) \n\t" + "vmovups %%zmm10, 0xC0(%%rax, %%rbx) \n\t" + "vmovups %%zmm2, (%%rax, %%rbx, 2) \n\t" + "vmovups %%zmm5, 0x40(%%rax, %%rbx, 2) \n\t" + "vmovups %%zmm8, 0x80(%%rax, %%rbx, 2) \n\t" + "vmovups %%zmm11, 0xC0(%%rax, %%rbx, 2) \n\t" : - : [output] "r" (c.output), [ostepC16] "r" (c.ostepC16), [flags] "r" (c.flags), [scale] "r" (c.scale) - : "%rax", "%rbx", "%rcx", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", - "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", - "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", - "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", - "%zmm31", "memory", "cc"); + : [output] "r" (c.output), + [eltwise] "r" (c.eltwise), + [ostepC16] "r" (c.ostepC16), + [flags] "r" (c.flags), + [scale] "r" (c.scale) + : "%rax", "%rbx", "%rcx", + "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", + "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", + "%zmm24", "%zmm31", "memory", "cc"); } void Avx512ConvKernel1x48(ConvController &c) { - convKernelForLoopXx48(3, 1) - - __asm__ __volatile__("movq %[output], %%rax \n\t" - "movq %[ostepC16], %%rbx \n\t" - "movq %[flags], %%rcx \n\t" - "and $0x1, %%rcx \n\t" - "je 0f \n\t" - "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" - "vpaddd (%%rax, %%rbx), %%zmm1, %%zmm1 \n\t" - "vpaddd (%%rax, %%rbx, 2), %%zmm2, %%zmm2 \n\t" + if (c.cross) { + convKernelForLoopXx48(3, 1, 1) + } else { + convKernelForLoopXx48(3, 1, 0) + } - ".align 16 \n\t" - "0: \n\t" - "movq %[flags], %%rcx \n\t" - "and $0xC, %%rcx \n\t" - "je 1f \n\t" + __asm__ __volatile__("movq %[output], %%rax \n\t" + "movq %[ostepC16], %%rbx \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x1, %%rcx \n\t" + "je 0f \n\t" + "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" + "vpaddd (%%rax, %%rbx), %%zmm1, %%zmm1 \n\t" + "vpaddd (%%rax, %%rbx, 2), %%zmm2, %%zmm2 \n\t" + + ".align 16 \n\t" + "0: \n\t" + "cmpq $0x0, %[scale] \n\t" + "jne 1f \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" relu3Regs(%%zmm) + "jmp 4f \n\t" - ".align 16 \n\t" - "1: \n\t" - "cmpq $0x0, %[scale] \n\t" - "je 2f \n\t" + ".align 16 \n\t" + "1: \n\t" convert3RegsI32ToF32(%[scale], %%zmm) - ".align 16 \n\t" - "2: \n\t" - "vmovups %%zmm0, (%%rax) \n\t" - "vmovups %%zmm1, (%%rax, %%rbx) \n\t" - "vmovups %%zmm2, (%%rax, %%rbx, 2) \n\t" + ".align 16 \n\t" + "2: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x2, %%rcx \n\t" + "je 3f \n\t" + "vaddps (%[eltwise]), %%zmm0, %%zmm0 \n\t" + "vaddps (%[eltwise], %%rbx), %%zmm1, %%zmm1 \n\t" + "vaddps (%[eltwise], %%rbx, 2), %%zmm2, %%zmm2 \n\t" + + ".align 16 \n\t" + "3: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" + relu3RegsPs(%%zmm) + + ".align 16 \n\t" + "4: \n\t" + "vmovups %%zmm0, (%%rax) \n\t" + "vmovups %%zmm1, (%%rax, %%rbx) \n\t" + "vmovups %%zmm2, (%%rax, %%rbx, 2) \n\t" : - : [output] "r" (c.output), [ostepC16] "r" (c.ostepC16), [flags] "r" (c.flags), [scale] "r" (c.scale) - : "%rax", "%rbx", "%rcx", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", - "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", - "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", - "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", - "%zmm31", "memory", "cc"); + : [output] "r" (c.output), + [eltwise] "r" (c.eltwise), + [ostepC16] "r" (c.ostepC16), + [flags] "r" (c.flags), + [scale] "r" (c.scale) + : "%rax", "%rbx", "%rcx", + "%zmm0", "%zmm1", "%zmm2", 
"%zmm24", "%zmm31", + "memory", "cc"); } -#define load32BiasTo2Regs(bias) \ - "vmovups ("#bias"), %%zmm0 \n\t" \ - "vmovups 0x40("#bias"), %%zmm1 \n\t" \ - -#define load32BiasTo12Regs(bias) \ - load32BiasTo2Regs(bias) \ - "vmovups %%zmm0, %%zmm2 \n\t" \ - "vmovups %%zmm1, %%zmm3 \n\t" \ - "vmovups %%zmm0, %%zmm4 \n\t" \ - "vmovups %%zmm1, %%zmm5 \n\t" \ - "vmovups %%zmm0, %%zmm6 \n\t" \ - "vmovups %%zmm1, %%zmm7 \n\t" \ - "vmovups %%zmm0, %%zmm8 \n\t" \ - "vmovups %%zmm1, %%zmm9 \n\t" \ - "vmovups %%zmm0, %%zmm10 \n\t" \ - "vmovups %%zmm1, %%zmm11 \n\t" - -#define load32BiasTo24Regs(bias) \ - load32BiasTo12Regs(bias) \ - "vmovups %%zmm0, %%zmm12 \n\t" \ - "vmovups %%zmm1, %%zmm13 \n\t" \ - "vmovups %%zmm0, %%zmm14 \n\t" \ - "vmovups %%zmm1, %%zmm15 \n\t" \ - "vmovups %%zmm0, %%zmm16 \n\t" \ - "vmovups %%zmm1, %%zmm17 \n\t" \ - "vmovups %%zmm0, %%zmm18 \n\t" \ - "vmovups %%zmm1, %%zmm19 \n\t" \ - "vmovups %%zmm0, %%zmm20 \n\t" \ - "vmovups %%zmm1, %%zmm21 \n\t" \ - "vmovups %%zmm0, %%zmm22 \n\t" \ - "vmovups %%zmm1, %%zmm23 \n\t" - #ifdef _USE_AVX512_VNNI -#define convKernel12x32c4(input, freg0, freg1, off0, off1, preg0, preg1) \ - "vpbroadcastd ("#input"), %%zmm28 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), %%zmm29 \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpdpbusd "#freg0", %%zmm28, %%zmm0 \n\t" \ - "vpdpbusd "#freg1", %%zmm28, %%zmm1 \n\t" \ - "vpbroadcastd ("#input"), %%zmm30 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), %%zmm31 \n\t" \ - "vpdpbusd "#freg0", %%zmm29, %%zmm2 \n\t" \ - "vpdpbusd "#freg1", %%zmm29, %%zmm3 \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpdpbusd "#freg0", %%zmm30, %%zmm4 \n\t" \ - "vpdpbusd "#freg1", %%zmm30, %%zmm5 \n\t" \ - "vpbroadcastd ("#input"), %%zmm28 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), %%zmm29 \n\t" \ - "vpdpbusd "#freg0", %%zmm31, %%zmm6 \n\t" \ - "vpdpbusd "#freg1", %%zmm31, %%zmm7 \n\t" \ - "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpdpbusd "#freg0", %%zmm28, %%zmm8 \n\t" \ - "vpdpbusd "#freg1", %%zmm28, %%zmm9 \n\t" \ - "vpbroadcastd ("#input"), %%zmm30 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), %%zmm31 \n\t" \ - "vpdpbusd "#freg0", %%zmm29, %%zmm10 \n\t" \ - "vpdpbusd "#freg1", %%zmm29, %%zmm11 \n\t" \ - "vmovups "#off1"(%[filter]), "#preg1" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpdpbusd "#freg0", %%zmm30, %%zmm12 \n\t" \ - "vpdpbusd "#freg1", %%zmm30, %%zmm13 \n\t" \ - "vpbroadcastd ("#input"), %%zmm28 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), %%zmm29 \n\t" \ - "vpdpbusd "#freg0", %%zmm31, %%zmm14 \n\t" \ - "vpdpbusd "#freg1", %%zmm31, %%zmm15 \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpdpbusd "#freg0", %%zmm28, %%zmm16 \n\t" \ - "vpdpbusd "#freg1", %%zmm28, %%zmm17 \n\t" \ - "vpbroadcastd ("#input"), %%zmm30 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), %%zmm31 \n\t" \ - "vpdpbusd "#freg0", %%zmm29, %%zmm18 \n\t" \ - "vpdpbusd "#freg1", %%zmm29, %%zmm19 \n\t" \ - "vpdpbusd "#freg0", %%zmm30, %%zmm20 \n\t" \ - "vpdpbusd "#freg1", %%zmm30, %%zmm21 \n\t" \ - "vpdpbusd "#freg0", %%zmm31, %%zmm22 \n\t" \ - "vpdpbusd "#freg1", %%zmm31, %%zmm23 \n\t" - -#define convKernel6x32c4(input, freg0, freg1, off0, off1, preg0, preg1) \ - "vpbroadcastd ("#input"), %%zmm28 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), %%zmm29 \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq 
%[stepC16], "#input" \n\t" \ - "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ - "vpdpbusd "#freg0", %%zmm28, %%zmm0 \n\t" \ - "vpdpbusd "#freg1", %%zmm28, %%zmm1 \n\t" \ - "vpbroadcastd ("#input"), %%zmm30 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), %%zmm31 \n\t" \ - "vpdpbusd "#freg0", %%zmm29, %%zmm2 \n\t" \ - "vpdpbusd "#freg1", %%zmm29, %%zmm3 \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpdpbusd "#freg0", %%zmm30, %%zmm4 \n\t" \ - "vpdpbusd "#freg1", %%zmm30, %%zmm5 \n\t" \ - "vpbroadcastd ("#input"), %%zmm28 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), %%zmm29 \n\t" \ - "vpdpbusd "#freg0", %%zmm31, %%zmm6 \n\t" \ - "vpdpbusd "#freg1", %%zmm31, %%zmm7 \n\t" \ - "vmovups "#off1"(%[filter]), "#preg1" \n\t" \ - "vpdpbusd "#freg0", %%zmm28, %%zmm8 \n\t" \ - "vpdpbusd "#freg1", %%zmm28, %%zmm9 \n\t" \ - "vpdpbusd "#freg0", %%zmm29, %%zmm10 \n\t" \ - "vpdpbusd "#freg1", %%zmm29, %%zmm11 \n\t" - -#define convKernel1x32c4(input, freg0, freg1, off0, off1, preg0, preg1) \ - "vpbroadcastd ("#input"), %%zmm28 \n\t" \ - "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ - "vmovups "#off1"(%[filter]), "#preg1" \n\t" \ - "vpdpbusd "#freg0", %%zmm28, %%zmm0 \n\t" \ - "vpdpbusd "#freg1", %%zmm28, %%zmm1 \n\t" +#define convKernel12x32c4_1(input, freg0, freg1, off0, off1, preg0, preg1) \ + "movq (%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), %%zmm28 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm29 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x8(%[stepC16]), "#input" \n\t" \ + "movq 0x10(%[stepC16]), %%r10 \n\t" \ + "vpdpbusd "#freg0", %%zmm28, %%zmm0 \n\t" \ + "vpdpbusd "#freg1", %%zmm28, %%zmm1 \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm31 \n\t" \ + "vpdpbusd "#freg0", %%zmm29, %%zmm2 \n\t" \ + "vpdpbusd "#freg1", %%zmm29, %%zmm3 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x18(%[stepC16]), "#input" \n\t" \ + "movq 0x20(%[stepC16]), %%r10 \n\t" \ + "vpdpbusd "#freg0", %%zmm30, %%zmm4 \n\t" \ + "vpdpbusd "#freg1", %%zmm30, %%zmm5 \n\t" \ + "vpbroadcastd ("#input"), %%zmm28 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm29 \n\t" \ + "vpdpbusd "#freg0", %%zmm31, %%zmm6 \n\t" \ + "vpdpbusd "#freg1", %%zmm31, %%zmm7 \n\t" \ + "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x28(%[stepC16]), "#input" \n\t" \ + "movq 0x30(%[stepC16]), %%r10 \n\t" \ + "vpdpbusd "#freg0", %%zmm28, %%zmm8 \n\t" \ + "vpdpbusd "#freg1", %%zmm28, %%zmm9 \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm31 \n\t" \ + "vpdpbusd "#freg0", %%zmm29, %%zmm10 \n\t" \ + "vpdpbusd "#freg1", %%zmm29, %%zmm11 \n\t" \ + "vmovups "#off1"(%[filter]), "#preg1" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x38(%[stepC16]), "#input" \n\t" \ + "movq 0x40(%[stepC16]), %%r10 \n\t" \ + "vpdpbusd "#freg0", %%zmm30, %%zmm12 \n\t" \ + "vpdpbusd "#freg1", %%zmm30, %%zmm13 \n\t" \ + "vpbroadcastd ("#input"), %%zmm28 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm29 \n\t" \ + "vpdpbusd "#freg0", %%zmm31, %%zmm14 \n\t" \ + "vpdpbusd "#freg1", %%zmm31, %%zmm15 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x48(%[stepC16]), "#input" \n\t" \ + "movq 0x50(%[stepC16]), %%r10 \n\t" \ + "vpdpbusd "#freg0", %%zmm28, %%zmm16 \n\t" \ + "vpdpbusd "#freg1", %%zmm28, %%zmm17 \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm31 \n\t" \ + "vpdpbusd "#freg0", %%zmm29, %%zmm18 \n\t" \ + "vpdpbusd "#freg1", %%zmm29, %%zmm19 \n\t" \ + "vpdpbusd 
"#freg0", %%zmm30, %%zmm20 \n\t" \ + "vpdpbusd "#freg1", %%zmm30, %%zmm21 \n\t" \ + "vpdpbusd "#freg0", %%zmm31, %%zmm22 \n\t" \ + "vpdpbusd "#freg1", %%zmm31, %%zmm23 \n\t" + +#define convKernel6x32c4_1(input, freg0, freg1, off0, off1, preg0, preg1) \ + "movq (%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), %%zmm28 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm29 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x8(%[stepC16]), "#input" \n\t" \ + "movq 0x10(%[stepC16]), %%r10 \n\t" \ + "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ + "vpdpbusd "#freg0", %%zmm28, %%zmm0 \n\t" \ + "vpdpbusd "#freg1", %%zmm28, %%zmm1 \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm31 \n\t" \ + "vpdpbusd "#freg0", %%zmm29, %%zmm2 \n\t" \ + "vpdpbusd "#freg1", %%zmm29, %%zmm3 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x18(%[stepC16]), "#input" \n\t" \ + "movq 0x20(%[stepC16]), %%r10 \n\t" \ + "vpdpbusd "#freg0", %%zmm30, %%zmm4 \n\t" \ + "vpdpbusd "#freg1", %%zmm30, %%zmm5 \n\t" \ + "vpbroadcastd ("#input"), %%zmm28 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm29 \n\t" \ + "vpdpbusd "#freg0", %%zmm31, %%zmm6 \n\t" \ + "vpdpbusd "#freg1", %%zmm31, %%zmm7 \n\t" \ + "vmovups "#off1"(%[filter]), "#preg1" \n\t" \ + "vpdpbusd "#freg0", %%zmm28, %%zmm8 \n\t" \ + "vpdpbusd "#freg1", %%zmm28, %%zmm9 \n\t" \ + "vpdpbusd "#freg0", %%zmm29, %%zmm10 \n\t" \ + "vpdpbusd "#freg1", %%zmm29, %%zmm11 \n\t" + +#define convKernel1x32c4_1(input, freg0, freg1, off0, off1, preg0, preg1) \ + "vpbroadcastd ("#input"), %%zmm28 \n\t" \ + "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ + "vmovups "#off1"(%[filter]), "#preg1" \n\t" \ + "vpdpbusd "#freg0", %%zmm28, %%zmm0 \n\t" \ + "vpdpbusd "#freg1", %%zmm28, %%zmm1 \n\t" + +#define convKernel12x32c4_0(input, freg0, freg1, off0, off1, preg0, preg1) \ + "vpbroadcastd ("#input"), %%zmm28 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm29 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpdpbusd "#freg0", %%zmm28, %%zmm0 \n\t" \ + "vpdpbusd "#freg1", %%zmm28, %%zmm1 \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm31 \n\t" \ + "vpdpbusd "#freg0", %%zmm29, %%zmm2 \n\t" \ + "vpdpbusd "#freg1", %%zmm29, %%zmm3 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpdpbusd "#freg0", %%zmm30, %%zmm4 \n\t" \ + "vpdpbusd "#freg1", %%zmm30, %%zmm5 \n\t" \ + "vpbroadcastd ("#input"), %%zmm28 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm29 \n\t" \ + "vpdpbusd "#freg0", %%zmm31, %%zmm6 \n\t" \ + "vpdpbusd "#freg1", %%zmm31, %%zmm7 \n\t" \ + "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpdpbusd "#freg0", %%zmm28, %%zmm8 \n\t" \ + "vpdpbusd "#freg1", %%zmm28, %%zmm9 \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm31 \n\t" \ + "vpdpbusd "#freg0", %%zmm29, %%zmm10 \n\t" \ + "vpdpbusd "#freg1", %%zmm29, %%zmm11 \n\t" \ + "vmovups "#off1"(%[filter]), "#preg1" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpdpbusd "#freg0", %%zmm30, %%zmm12 \n\t" \ + "vpdpbusd "#freg1", %%zmm30, %%zmm13 \n\t" \ + "vpbroadcastd ("#input"), %%zmm28 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm29 \n\t" \ + "vpdpbusd "#freg0", %%zmm31, %%zmm14 \n\t" \ + "vpdpbusd "#freg1", %%zmm31, %%zmm15 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpdpbusd "#freg0", %%zmm28, %%zmm16 \n\t" \ + "vpdpbusd "#freg1", %%zmm28, 
%%zmm17 \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm31 \n\t" \ + "vpdpbusd "#freg0", %%zmm29, %%zmm18 \n\t" \ + "vpdpbusd "#freg1", %%zmm29, %%zmm19 \n\t" \ + "vpdpbusd "#freg0", %%zmm30, %%zmm20 \n\t" \ + "vpdpbusd "#freg1", %%zmm30, %%zmm21 \n\t" \ + "vpdpbusd "#freg0", %%zmm31, %%zmm22 \n\t" \ + "vpdpbusd "#freg1", %%zmm31, %%zmm23 \n\t" + +#define convKernel6x32c4_0(input, freg0, freg1, off0, off1, preg0, preg1) \ + "vpbroadcastd ("#input"), %%zmm28 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm29 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ + "vpdpbusd "#freg0", %%zmm28, %%zmm0 \n\t" \ + "vpdpbusd "#freg1", %%zmm28, %%zmm1 \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm31 \n\t" \ + "vpdpbusd "#freg0", %%zmm29, %%zmm2 \n\t" \ + "vpdpbusd "#freg1", %%zmm29, %%zmm3 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpdpbusd "#freg0", %%zmm30, %%zmm4 \n\t" \ + "vpdpbusd "#freg1", %%zmm30, %%zmm5 \n\t" \ + "vpbroadcastd ("#input"), %%zmm28 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm29 \n\t" \ + "vpdpbusd "#freg0", %%zmm31, %%zmm6 \n\t" \ + "vpdpbusd "#freg1", %%zmm31, %%zmm7 \n\t" \ + "vmovups "#off1"(%[filter]), "#preg1" \n\t" \ + "vpdpbusd "#freg0", %%zmm28, %%zmm8 \n\t" \ + "vpdpbusd "#freg1", %%zmm28, %%zmm9 \n\t" \ + "vpdpbusd "#freg0", %%zmm29, %%zmm10 \n\t" \ + "vpdpbusd "#freg1", %%zmm29, %%zmm11 \n\t" + +#define convKernel1x32c4_0(input, freg0, freg1, off0, off1, preg0, preg1) \ + "vpbroadcastd ("#input"), %%zmm28 \n\t" \ + "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ + "vmovups "#off1"(%[filter]), "#preg1" \n\t" \ + "vpdpbusd "#freg0", %%zmm28, %%zmm0 \n\t" \ + "vpdpbusd "#freg1", %%zmm28, %%zmm1 \n\t" + #else + #define convKernel12x32c4_3(input, freg0, freg1, off0, off1, preg0, preg1, preg2) \ - "vpbroadcastd ("#input"), %%zmm29 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), %%zmm30 \n\t" \ - "vpmaddubsw "#freg0", %%zmm29, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm29, "#preg1" \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg2" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd ("#input"), %%zmm29 \n\t" \ + "vpbroadcastd ("#input"), %%zmm29 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm30 \n\t" \ + "vpmaddubsw "#freg0", %%zmm29, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm29, "#preg1" \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg2" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input"), %%zmm29 \n\t" \ "vpaddd %%zmm0, "#preg0", %%zmm0 \n\t" \ "vpaddd %%zmm1, "#preg1", %%zmm1 \n\t" \ "vpaddd %%zmm2, "#preg2", %%zmm2 \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg0", %%zmm29, "#preg1" \n\t" \ - "vpmaddubsw "#freg1", %%zmm29, "#preg2" \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), %%zmm30 \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpbroadcastd ("#input"), %%zmm29 \n\t" \ + "vpmaddubsw 
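On VNNI-capable parts, each step of the `convKernel*x32c4_*` macros above collapses the three-instruction emulation into a single `vpdpbusd`, which multiplies four unsigned bytes by four signed bytes, sums the products, and adds the result into an int32 accumulator with no int16 intermediate (so no saturation concern). A one-lane scalar equivalent, with illustrative names:

```cpp
#include <cstdint>

// Scalar equivalent of one vpdpbusd lane: four u8*s8 products summed into an int32 accumulator.
static inline int32_t vpdpbusd_lane(int32_t acc, const uint8_t a[4], const int8_t b[4])
{
    return acc + (int32_t)a[0] * b[0] + (int32_t)a[1] * b[1]
               + (int32_t)a[2] * b[2] + (int32_t)a[3] * b[3];
}
```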
"#freg1", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg0", %%zmm29, "#preg1" \n\t" \ + "vpmaddubsw "#freg1", %%zmm29, "#preg2" \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm30 \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpbroadcastd ("#input"), %%zmm29 \n\t" \ "vpaddd %%zmm3, "#preg0", %%zmm3 \n\t" \ "vpaddd %%zmm4, "#preg1", %%zmm4 \n\t" \ "vpaddd %%zmm5, "#preg2", %%zmm5 \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg0", %%zmm29, "#preg2" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), %%zmm30 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg0", %%zmm29, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm30 \n\t" \ "vpaddd %%zmm6, "#preg0", %%zmm6 \n\t" \ "vpaddd %%zmm7, "#preg1", %%zmm7 \n\t" \ "vpaddd %%zmm8, "#preg2", %%zmm8 \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpmaddubsw "#freg1", %%zmm29, "#preg0" \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg2" \n\t" \ - "vpbroadcastd ("#input"), %%zmm29 \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), %%zmm30 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpmaddubsw "#freg1", %%zmm29, "#preg0" \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg2" \n\t" \ + "vpbroadcastd ("#input"), %%zmm29 \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm30 \n\t" \ "vpaddd %%zmm9, "#preg0", %%zmm9 \n\t" \ - "vpaddd %%zmm10, "#preg1", %%zmm10 \n\t" \ - "vpaddd %%zmm11, "#preg2", %%zmm11 \n\t" \ - "vpmaddubsw "#freg0", %%zmm29, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm29, "#preg1" \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg2" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd ("#input"), %%zmm29 \n\t" \ - "vpaddd %%zmm12, "#preg0", %%zmm12 \n\t" \ - "vpaddd %%zmm13, "#preg1", %%zmm13 \n\t" \ - "vpaddd %%zmm14, "#preg2", %%zmm14 \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg0", %%zmm29, "#preg1" \n\t" \ - "vpmaddubsw "#freg1", %%zmm29, "#preg2" \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), %%zmm30 \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpbroadcastd ("#input"), %%zmm29 \n\t" \ - "vpaddd %%zmm15, "#preg0", %%zmm15 \n\t" \ - "vpaddd %%zmm16, "#preg1", %%zmm16 \n\t" \ - "vpaddd 
%%zmm17, "#preg2", %%zmm17 \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg0", %%zmm29, "#preg2" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), %%zmm30 \n\t" \ - "vpaddd %%zmm18, "#preg0", %%zmm18 \n\t" \ - "vpaddd %%zmm19, "#preg1", %%zmm19 \n\t" \ - "vpaddd %%zmm20, "#preg2", %%zmm20 \n\t" \ - "vpmaddubsw "#freg1", %%zmm29, "#preg0" \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg2" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vmovups "#off1"(%[filter]), "#freg1" \n\t" \ - "vpaddd %%zmm21, "#preg0", %%zmm21 \n\t" \ - "vpaddd %%zmm22, "#preg1", %%zmm22 \n\t" \ - "vpaddd %%zmm23, "#preg2", %%zmm23 \n\t" + "vpaddd %%zmm10, "#preg1", %%zmm10 \n\t" \ + "vpaddd %%zmm11, "#preg2", %%zmm11 \n\t" \ + "vpmaddubsw "#freg0", %%zmm29, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm29, "#preg1" \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg2" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input"), %%zmm29 \n\t" \ + "vpaddd %%zmm12, "#preg0", %%zmm12 \n\t" \ + "vpaddd %%zmm13, "#preg1", %%zmm13 \n\t" \ + "vpaddd %%zmm14, "#preg2", %%zmm14 \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg0", %%zmm29, "#preg1" \n\t" \ + "vpmaddubsw "#freg1", %%zmm29, "#preg2" \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm30 \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpbroadcastd ("#input"), %%zmm29 \n\t" \ + "vpaddd %%zmm15, "#preg0", %%zmm15 \n\t" \ + "vpaddd %%zmm16, "#preg1", %%zmm16 \n\t" \ + "vpaddd %%zmm17, "#preg2", %%zmm17 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg0", %%zmm29, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm30 \n\t" \ + "vpaddd %%zmm18, "#preg0", %%zmm18 \n\t" \ + "vpaddd %%zmm19, "#preg1", %%zmm19 \n\t" \ + "vpaddd %%zmm20, "#preg2", %%zmm20 \n\t" \ + "vpmaddubsw "#freg1", %%zmm29, "#preg0" \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vmovups "#off1"(%[filter]), "#freg1" \n\t" \ + "vpaddd %%zmm21, "#preg0", %%zmm21 \n\t" \ + "vpaddd %%zmm22, "#preg1", %%zmm22 \n\t" \ + "vpaddd %%zmm23, "#preg2", %%zmm23 \n\t" #define convKernel6x32c4_3(input, freg0, freg1, off0, off1, preg0, preg1, preg2) \ - "vpbroadcastd ("#input"), %%zmm29 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), %%zmm30 \n\t" \ - "vpmaddubsw "#freg0", %%zmm29, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm29, "#preg1" \n\t" \ - 
"vpmaddubsw "#freg0", %%zmm30, "#preg2" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd ("#input"), %%zmm29 \n\t" \ + "vpbroadcastd ("#input"), %%zmm29 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm30 \n\t" \ + "vpmaddubsw "#freg0", %%zmm29, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm29, "#preg1" \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg2" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input"), %%zmm29 \n\t" \ "vpaddd %%zmm0, "#preg0", %%zmm0 \n\t" \ "vpaddd %%zmm1, "#preg1", %%zmm1 \n\t" \ "vpaddd %%zmm2, "#preg2", %%zmm2 \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg0", %%zmm29, "#preg1" \n\t" \ - "vpmaddubsw "#freg1", %%zmm29, "#preg2" \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), %%zmm30 \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpbroadcastd ("#input"), %%zmm29 \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg0", %%zmm29, "#preg1" \n\t" \ + "vpmaddubsw "#freg1", %%zmm29, "#preg2" \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm30 \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpbroadcastd ("#input"), %%zmm29 \n\t" \ "vpaddd %%zmm3, "#preg0", %%zmm3 \n\t" \ "vpaddd %%zmm4, "#preg1", %%zmm4 \n\t" \ "vpaddd %%zmm5, "#preg2", %%zmm5 \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg0", %%zmm29, "#preg2" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), %%zmm30 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg0", %%zmm29, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm30 \n\t" \ "vpaddd %%zmm6, "#preg0", %%zmm6 \n\t" \ "vpaddd %%zmm7, "#preg1", %%zmm7 \n\t" \ "vpaddd %%zmm8, "#preg2", %%zmm8 \n\t" \ - "vpmaddubsw "#freg1", %%zmm29, "#preg0" \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg2" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vmovups "#off1"(%[filter]), "#freg1" \n\t" \ + "vpmaddubsw "#freg1", %%zmm29, "#preg0" \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vmovups "#off1"(%[filter]), 
"#freg1" \n\t" \ "vpaddd %%zmm9, "#preg0", %%zmm9 \n\t" \ - "vpaddd %%zmm10, "#preg1", %%zmm10 \n\t" \ - "vpaddd %%zmm11, "#preg2", %%zmm11 \n\t" + "vpaddd %%zmm10, "#preg1", %%zmm10 \n\t" \ + "vpaddd %%zmm11, "#preg2", %%zmm11 \n\t" #define convKernel1x32c4_3(input, freg0, freg1, off0, off1, preg0, preg1, preg2) \ - "vpbroadcastd ("#input"), %%zmm29 \n\t" \ - "vpmaddubsw "#freg0", %%zmm29, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm29, "#preg1" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vmovups "#off1"(%[filter]), "#freg1" \n\t" \ + "vpbroadcastd ("#input"), %%zmm29 \n\t" \ + "vpmaddubsw "#freg0", %%zmm29, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm29, "#preg1" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vmovups "#off1"(%[filter]), "#freg1" \n\t" \ + "vpaddd %%zmm0, "#preg0", %%zmm0 \n\t" \ + "vpaddd %%zmm1, "#preg1", %%zmm1 \n\t" + +#define convKernel12x32c4_4(input, freg0, freg1, off0, off1, preg0, preg1, preg2) \ + "movq (%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), %%zmm29 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm30 \n\t" \ + "vpmaddubsw "#freg0", %%zmm29, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm29, "#preg1" \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg2" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x8(%[stepC16]), "#input" \n\t" \ + "movq 0x10(%[stepC16]), %%r10 \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input"), %%zmm29 \n\t" \ + "vpaddd %%zmm0, "#preg0", %%zmm0 \n\t" \ + "vpaddd %%zmm1, "#preg1", %%zmm1 \n\t" \ + "vpaddd %%zmm2, "#preg2", %%zmm2 \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg0", %%zmm29, "#preg1" \n\t" \ + "vpmaddubsw "#freg1", %%zmm29, "#preg2" \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm30 \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x18(%[stepC16]), "#input" \n\t" \ + "movq 0x20(%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), %%zmm29 \n\t" \ + "vpaddd %%zmm3, "#preg0", %%zmm3 \n\t" \ + "vpaddd %%zmm4, "#preg1", %%zmm4 \n\t" \ + "vpaddd %%zmm5, "#preg2", %%zmm5 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg0", %%zmm29, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm30 \n\t" \ + "vpaddd %%zmm6, "#preg0", %%zmm6 \n\t" \ + "vpaddd %%zmm7, "#preg1", %%zmm7 \n\t" \ + "vpaddd %%zmm8, "#preg2", %%zmm8 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x28(%[stepC16]), "#input" \n\t" \ + "movq 0x30(%[stepC16]), %%r10 \n\t" \ + "vpmaddubsw "#freg1", %%zmm29, "#preg0" \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg2" \n\t" \ + "vpbroadcastd ("#input"), %%zmm29 \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm30 \n\t" \ + "vpaddd %%zmm9, "#preg0", %%zmm9 \n\t" \ + "vpaddd %%zmm10, "#preg1", 
%%zmm10 \n\t" \ + "vpaddd %%zmm11, "#preg2", %%zmm11 \n\t" \ + "vpmaddubsw "#freg0", %%zmm29, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm29, "#preg1" \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg2" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x38(%[stepC16]), "#input" \n\t" \ + "movq 0x40(%[stepC16]), %%r10 \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input"), %%zmm29 \n\t" \ + "vpaddd %%zmm12, "#preg0", %%zmm12 \n\t" \ + "vpaddd %%zmm13, "#preg1", %%zmm13 \n\t" \ + "vpaddd %%zmm14, "#preg2", %%zmm14 \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg0", %%zmm29, "#preg1" \n\t" \ + "vpmaddubsw "#freg1", %%zmm29, "#preg2" \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm30 \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x48(%[stepC16]), "#input" \n\t" \ + "movq 0x50(%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), %%zmm29 \n\t" \ + "vpaddd %%zmm15, "#preg0", %%zmm15 \n\t" \ + "vpaddd %%zmm16, "#preg1", %%zmm16 \n\t" \ + "vpaddd %%zmm17, "#preg2", %%zmm17 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg0", %%zmm29, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm30 \n\t" \ + "vpaddd %%zmm18, "#preg0", %%zmm18 \n\t" \ + "vpaddd %%zmm19, "#preg1", %%zmm19 \n\t" \ + "vpaddd %%zmm20, "#preg2", %%zmm20 \n\t" \ + "vpmaddubsw "#freg1", %%zmm29, "#preg0" \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vmovups "#off1"(%[filter]), "#freg1" \n\t" \ + "vpaddd %%zmm21, "#preg0", %%zmm21 \n\t" \ + "vpaddd %%zmm22, "#preg1", %%zmm22 \n\t" \ + "vpaddd %%zmm23, "#preg2", %%zmm23 \n\t" + +#define convKernel6x32c4_4(input, freg0, freg1, off0, off1, preg0, preg1, preg2) \ + "movq (%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), %%zmm29 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm30 \n\t" \ + "vpmaddubsw "#freg0", %%zmm29, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm29, "#preg1" \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg2" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x8(%[stepC16]), "#input" \n\t" \ + "movq 0x10(%[stepC16]), %%r10 \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input"), %%zmm29 \n\t" \ + "vpaddd %%zmm0, "#preg0", %%zmm0 \n\t" \ + "vpaddd %%zmm1, "#preg1", %%zmm1 \n\t" \ + "vpaddd %%zmm2, "#preg2", %%zmm2 \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg0", %%zmm29, "#preg1" \n\t" \ + "vpmaddubsw "#freg1", %%zmm29, "#preg2" \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm30 \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x18(%[stepC16]), "#input" \n\t" \ + "movq 0x20(%[stepC16]), %%r10 \n\t" \ + 
"vpbroadcastd ("#input"), %%zmm29 \n\t" \ + "vpaddd %%zmm3, "#preg0", %%zmm3 \n\t" \ + "vpaddd %%zmm4, "#preg1", %%zmm4 \n\t" \ + "vpaddd %%zmm5, "#preg2", %%zmm5 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg0", %%zmm29, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm30 \n\t" \ + "vpaddd %%zmm6, "#preg0", %%zmm6 \n\t" \ + "vpaddd %%zmm7, "#preg1", %%zmm7 \n\t" \ + "vpaddd %%zmm8, "#preg2", %%zmm8 \n\t" \ + "vpmaddubsw "#freg1", %%zmm29, "#preg0" \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vmovups "#off1"(%[filter]), "#freg1" \n\t" \ + "vpaddd %%zmm9, "#preg0", %%zmm9 \n\t" \ + "vpaddd %%zmm10, "#preg1", %%zmm10 \n\t" \ + "vpaddd %%zmm11, "#preg2", %%zmm11 \n\t" + +#define convKernel1x32c4_4(input, freg0, freg1, off0, off1, preg0, preg1, preg2) \ + "vpbroadcastd ("#input"), %%zmm29 \n\t" \ + "vpmaddubsw "#freg0", %%zmm29, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm29, "#preg1" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vmovups "#off1"(%[filter]), "#freg1" \n\t" \ "vpaddd %%zmm0, "#preg0", %%zmm0 \n\t" \ "vpaddd %%zmm1, "#preg1", %%zmm1 \n\t" -#define convKernel12x32c4(input, freg0, freg1, off0, off1, preg0, preg1) \ +#define convKernel12x32c4_0(input, freg0, freg1, off0, off1, preg0, preg1) \ convKernel12x32c4_3(input, %%zmm24, %%zmm25, off0, off1, %%zmm26, %%zmm27, %%zmm28) -#define convKernel6x32c4(input, freg0, freg1, off0, off1, preg0, preg1) \ +#define convKernel6x32c4_0(input, freg0, freg1, off0, off1, preg0, preg1) \ convKernel6x32c4_3(input, %%zmm24, %%zmm25, off0, off1, %%zmm26, %%zmm27, %%zmm28) -#define convKernel1x32c4(input, freg0, freg1, off0, off1, preg0, preg1) \ +#define convKernel1x32c4_0(input, freg0, freg1, off0, off1, preg0, preg1) \ convKernel1x32c4_3(input, %%zmm24, %%zmm25, off0, off1, %%zmm26, %%zmm27, %%zmm28) + +#define convKernel12x32c4_1(input, freg0, freg1, off0, off1, preg0, preg1) \ + convKernel12x32c4_4(input, %%zmm24, %%zmm25, off0, off1, %%zmm26, %%zmm27, %%zmm28) + +#define convKernel6x32c4_1(input, freg0, freg1, off0, off1, preg0, preg1) \ + convKernel6x32c4_4(input, %%zmm24, %%zmm25, off0, off1, %%zmm26, %%zmm27, %%zmm28) + +#define convKernel1x32c4_1(input, freg0, freg1, off0, off1, preg0, preg1) \ + convKernel1x32c4_4(input, %%zmm24, %%zmm25, off0, off1, %%zmm26, %%zmm27, %%zmm28) + #endif -#define convKernelForLoopXx32(rnum, wsize) \ - __asm__ __volatile__("vmovups (%[filter]), %%zmm24 \n\t" \ - "vmovups 0x40(%[filter]), %%zmm25 \n\t" \ - "addq $0x80, %[filter] \n\t" \ - "mov $1, %%eax \n\t" \ - "vmovd %%eax, %%xmm0 \n\t" \ - "vpbroadcastw %%xmm0, %%zmm31 \n\t" \ - "movq %[flags], %%rax \n\t" \ - "andq $0x1, %%rax \n\t" \ - "jne 0f \n\t" \ - load32BiasTo##rnum##Regs(%[bias]) \ - "cmpq $0x10, %%rcx \n\t" \ - "jl 4f \n\t" \ - "jmp 1f \n\t" \ - ".align 16 \n\t" \ - "0: \n\t" \ - clear##rnum##Regs(%%zmm) \ - "cmpq $0x10, %%rcx \n\t" \ - "jl 4f \n\t" \ - ".align 16 \n\t" \ - "1: \n\t" \ - "mov %[kh], %%rbx \n\t" \ - ".align 16 \n\t" \ - "2: \n\t" \ - "mov 
%[kw], %%r9 \n\t" \ - ".align 16 \n\t" \ - "3: \n\t" \ - "movq %[input], %%rax \n\t" \ - convKernel##wsize##x32c4(%%rax, %%zmm24, %%zmm25, 0x0, 0x40, %%zmm26, %%zmm27) \ - "movq %[input], %%rax \n\t" \ - "addq $0x4, %%rax \n\t" \ - convKernel##wsize##x32c4(%%rax, %%zmm26, %%zmm27, 0x80, 0xC0, %%zmm24, %%zmm25) \ - "movq %[input], %%rax \n\t" \ - "addq $0x8, %%rax \n\t" \ - convKernel##wsize##x32c4(%%rax, %%zmm24, %%zmm25, 0x100, 0x140, %%zmm26, %%zmm27) \ - "movq %[input], %%rax \n\t" \ - "addq $0xC, %%rax \n\t" \ - convKernel##wsize##x32c4(%%rax, %%zmm26, %%zmm27, 0x180, 0x1C0, %%zmm24, %%zmm25) \ - "addq $0x200, %[filter] \n\t" \ - "addq %[dilateW], %[input] \n\t" \ - "dec %%r9 \n\t" \ - "jg 3b \n\t" \ - "addq %[dilateH], %[input] \n\t" \ - "dec %%rbx \n\t" \ - "jg 2b \n\t" \ - "addq %[fStep], %[input] \n\t" \ - "subq $0x10, %%rcx \n\t" \ - "cmpq $0x10, %%rcx \n\t" \ - "jge 1b \n\t" \ - "subq %[fStep], %[input] \n\t" \ - "addq %[f8Step], %[input] \n\t" \ - ".align 16 \n\t" \ - "4: \n\t" \ - : "+c" (c.ic), [input] "+r" (c.input), [filter] "+r" (c.filter) \ - : [bias] "r" (c.bias), [kh] "r" (c.kh), [kw] "r" (c.kw), \ - [stepC16] "r" (c.stepC16), [dilateW] "r" (c.dilateW), \ - [dilateH] "r" (c.dilateH), [fStep] "r" (c.fStep), [flags] "r" (c.flags), \ - [f8Step] "r" (c.f8Step) \ - : "%rax", "%rbx", "%r9", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", \ - "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", \ - "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", \ - "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", \ - "%zmm31", "memory", "cc"); \ +#define convKernelForLoopXx32(rnum, wsize, cross) \ + __asm__ __volatile__("vmovups (%[filter]), %%zmm24 \n\t" \ + "vmovups 0x40(%[filter]), %%zmm25 \n\t" \ + "addq $0x80, %[filter] \n\t" \ + "mov $1, %%eax \n\t" \ + "vmovd %%eax, %%xmm0 \n\t" \ + "vpbroadcastw %%xmm0, %%zmm31 \n\t" \ + "movq %[flags], %%rax \n\t" \ + "andq $0x1, %%rax \n\t" \ + "jne 0f \n\t" \ + load32BiasTo##rnum##Regs(%[bias]) \ + "jmp 1f \n\t" \ + ".align 16 \n\t" \ + "0: \n\t" \ + clear##rnum##Regs(%%zmm) \ + ".align 16 \n\t" \ + "1: \n\t" \ + : [filter] "+r" (c.filter) \ + : [bias] "r" (c.bias), \ + [flags] "r" (c.flags) \ + : "%rax", \ + "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", \ + "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", \ + "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", \ + "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", \ + "memory", "cc"); \ + if (c.ic >= 16) { \ + __asm__ __volatile__("movq (%[stepC16]), %%r10 \n\t" \ + ".align 16 \n\t" \ + "1: \n\t" \ + "mov %[kh], %%rbx \n\t" \ + ".align 16 \n\t" \ + "2: \n\t" \ + "mov %[kw], %%r9 \n\t" \ + ".align 16 \n\t" \ + "3: \n\t" \ + "movq %[input], %%rax \n\t" \ + convKernel##wsize##x32c4_##cross( \ + %%rax, %%zmm24, %%zmm25, 0x0, 0x40, %%zmm26, %%zmm27) \ + "movq %[input], %%rax \n\t" \ + "addq $0x4, %%rax \n\t" \ + convKernel##wsize##x32c4_##cross( \ + %%rax, %%zmm26, %%zmm27, 0x80, 0xC0, %%zmm24, %%zmm25) \ + "movq %[input], %%rax \n\t" \ + "addq $0x8, %%rax \n\t" \ + convKernel##wsize##x32c4_##cross( \ + %%rax, %%zmm24, %%zmm25, 0x100, 0x140, %%zmm26, %%zmm27) \ + "movq %[input], %%rax \n\t" \ + "addq $0xC, %%rax \n\t" \ + convKernel##wsize##x32c4_##cross( \ + %%rax, %%zmm26, %%zmm27, 0x180, 0x1C0, %%zmm24, %%zmm25) \ + "addq $0x200, %[filter] \n\t" \ + "addq %[dilateW], %[input] \n\t" \ + "dec %%r9 \n\t" \ + "jg 3b \n\t" \ + "addq %[dilateH], %[input] \n\t" \ + "dec %%rbx 
\n\t" \ + "jg 2b \n\t" \ + "addq %[fStep], %[input] \n\t" \ + "subq $0x10, %%rcx \n\t" \ + "cmpq $0x10, %%rcx \n\t" \ + "jge 1b \n\t" \ + "subq %[fStep], %[input] \n\t" \ + "addq %[f8Step], %[input] \n\t" \ + ".align 16 \n\t" \ + "4: \n\t" \ + : "+c" (c.ic), \ + [input] "+r" (c.input), \ + [filter] "+r" (c.filter) \ + : [kh] "r" (c.kh), \ + [kw] "r" (c.kw), \ + [stepC16] "r" (c.stepC16), \ + [dilateW] "r" (c.dilateW), \ + [dilateH] "r" (c.dilateH), \ + [fStep] "r" (c.fStep), \ + [f8Step] "r" (c.f8Step) \ + : "%rax", "%rbx", "%r9", "%r10", \ + "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", \ + "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", \ + "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", \ + "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", \ + "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", \ + "%zmm30", "%zmm31", "memory", "cc"); \ + } \ if (c.ic > 0) { \ - __asm__ __volatile__("cmpq $0x8, %%rcx \n\t" \ - "jl 2f \n\t" \ - "subq $0x8, %%rcx \n\t" \ - "shr $1, %[dilateW] \n\t" \ - "shr $1, %[dilateH] \n\t" \ - "shr $1, %[fStep] \n\t" \ - "shr $1, %[stepC16] \n\t" \ - "mov %[kh], %%rbx \n\t" \ - ".align 16 \n\t" \ - "0: \n\t" \ - "mov %[kw], %%r9 \n\t" \ - ".align 16 \n\t" \ - "1: \n\t" \ - "movq %[input], %%rax \n\t" \ - convKernel##wsize##x32c4(%%rax, %%zmm24, %%zmm25, 0x0, 0x40, %%zmm26, %%zmm27) \ - "movq %[input], %%rax \n\t" \ - "addq $0x4, %%rax \n\t" \ - convKernel##wsize##x32c4(%%rax, %%zmm26, %%zmm27, 0x80, 0xC0, %%zmm24, %%zmm25) \ - "addq $0x100, %[filter] \n\t" \ - "addq %[dilateW], %[input] \n\t" \ - "dec %%r9 \n\t" \ - "jg 1b \n\t" \ - "addq %[dilateH], %[input] \n\t" \ - "dec %%rbx \n\t" \ - "jg 0b \n\t" \ - "addq %[f4Step], %[input] \n\t" \ - ".align 16 \n\t" \ - "2: \n\t" \ - "cmpq $0x4, %%rcx \n\t" \ - "jl 5f \n\t" \ - "shr $1, %[dilateW] \n\t" \ - "shr $1, %[dilateH] \n\t" \ - "shr $1, %[stepC16] \n\t" \ - "mov %[kh], %%rbx \n\t" \ - ".align 16 \n\t" \ - "3: \n\t" \ - "mov %[kw], %%r9 \n\t" \ - ".align 16 \n\t" \ - "4: \n\t" \ - "movq %[input], %%rax \n\t" \ - convKernel##wsize##x32c4(%%rax, %%zmm24, %%zmm25, 0x0, 0x40, %%zmm26, %%zmm27) \ - "addq $0x80, %[filter] \n\t" \ - "addq %[dilateW], %[input] \n\t" \ - "dec %%r9 \n\t" \ - "jg 4b \n\t" \ - "addq %[dilateH], %[input] \n\t" \ - "dec %%rbx \n\t" \ - "jg 3b \n\t" \ - ".align 16 \n\t" \ - "5: \n\t" \ - : "+c" (c.ic) \ - : [input] "r" (c.input), [filter] "r" (c.filter), [bias] "r" (c.bias), [kh] "r" (c.kh), [kw] "r" (c.kw), \ - [stepC16] "r" (c.stepC16), [dilateW] "r" (c.dilateW), \ - [dilateH] "r" (c.dilateH), [fStep] "r" (c.fStep), \ - [f4Step] "r" (c.f4Step) \ - : "%rax", "%rbx", "%r9", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", \ - "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", \ - "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", \ - "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", \ - "%zmm31", "memory", "cc"); \ + __asm__ __volatile__("cmpq $0x8, %%rcx \n\t" \ + "jl 2f \n\t" \ + "subq $0x8, %%rcx \n\t" \ + "shr $1, %[dilateW] \n\t" \ + "shr $1, %[dilateH] \n\t" \ + "shr $1, %[fStep] \n\t" \ + "addq $192, %[stepC16] \n\t" \ + "mov %[kh], %%rbx \n\t" \ + ".align 16 \n\t" \ + "0: \n\t" \ + "mov %[kw], %%r9 \n\t" \ + ".align 16 \n\t" \ + "1: \n\t" \ + "movq %[input], %%rax \n\t" \ + convKernel##wsize##x32c4_##cross( \ + %%rax, %%zmm24, %%zmm25, 0x0, 0x40, %%zmm26, %%zmm27) \ + "movq %[input], %%rax \n\t" \ + "addq $0x4, %%rax \n\t" \ + 
convKernel##wsize##x32c4_##cross( \ + %%rax, %%zmm26, %%zmm27, 0x80, 0xC0, %%zmm24, %%zmm25) \ + "addq $0x100, %[filter] \n\t" \ + "addq %[dilateW], %[input] \n\t" \ + "dec %%r9 \n\t" \ + "jg 1b \n\t" \ + "addq %[dilateH], %[input] \n\t" \ + "dec %%rbx \n\t" \ + "jg 0b \n\t" \ + "addq %[f4Step], %[input] \n\t" \ + ".align 16 \n\t" \ + "2: \n\t" \ + "cmpq $0x4, %%rcx \n\t" \ + "jl 5f \n\t" \ + "shr $1, %[dilateW] \n\t" \ + "shr $1, %[dilateH] \n\t" \ + "addq $192, %[stepC16] \n\t" \ + "mov %[kh], %%rbx \n\t" \ + ".align 16 \n\t" \ + "3: \n\t" \ + "mov %[kw], %%r9 \n\t" \ + ".align 16 \n\t" \ + "4: \n\t" \ + "movq %[input], %%rax \n\t" \ + convKernel##wsize##x32c4_##cross( \ + %%rax, %%zmm24, %%zmm25, 0x0, 0x40, %%zmm26, %%zmm27) \ + "addq $0x80, %[filter] \n\t" \ + "addq %[dilateW], %[input] \n\t" \ + "dec %%r9 \n\t" \ + "jg 4b \n\t" \ + "addq %[dilateH], %[input] \n\t" \ + "dec %%rbx \n\t" \ + "jg 3b \n\t" \ + ".align 16 \n\t" \ + "5: \n\t" \ + : "+c" (c.ic) \ + : [input] "r" (c.input), \ + [filter] "r" (c.filter), \ + [bias] "r" (c.bias), \ + [kh] "r" (c.kh), \ + [kw] "r" (c.kw), \ + [stepC16] "r" (c.stepC16), \ + [dilateW] "r" (c.dilateW), \ + [dilateH] "r" (c.dilateH), \ + [fStep] "r" (c.fStep), \ + [f4Step] "r" (c.f4Step) \ + : "%rax", "%rbx", "%r9", \ + "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", \ + "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", \ + "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", \ + "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", \ + "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", \ + "%zmm30", "%zmm31", "memory", "cc"); \ } void Avx512ConvKernel12x32(ConvController &c) { - convKernelForLoopXx32(24, 12) + if (c.cross) { + convKernelForLoopXx32(24, 12, 1) + } else { + convKernelForLoopXx32(24, 12, 0) + } - __asm__ __volatile__("movq %[output], %%rax \n\t" - "movq %[ostepC16], %%rbx \n\t" - "movq %[flags], %%rcx \n\t" - "and $0x1, %%rcx \n\t" + __asm__ __volatile__("movq %[output], %%rax \n\t" + "movq %[ostepC16], %%rbx \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x1, %%rcx \n\t" "je 0f \n\t" - "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" - "vpaddd 0x40(%%rax), %%zmm2, %%zmm2 \n\t" - "vpaddd 0x80(%%rax), %%zmm4, %%zmm4 \n\t" - "vpaddd 0xC0(%%rax), %%zmm6, %%zmm6 \n\t" - "vpaddd 0x100(%%rax), %%zmm8, %%zmm8 \n\t" - "vpaddd 0x140(%%rax), %%zmm10, %%zmm10 \n\t" - "vpaddd 0x180(%%rax), %%zmm12, %%zmm12 \n\t" - "vpaddd 0x1C0(%%rax), %%zmm14, %%zmm14 \n\t" - "vpaddd 0x200(%%rax), %%zmm16, %%zmm16 \n\t" - "vpaddd 0x240(%%rax), %%zmm18, %%zmm18 \n\t" - "vpaddd 0x280(%%rax), %%zmm20, %%zmm20 \n\t" - "vpaddd 0x2C0(%%rax), %%zmm22, %%zmm22 \n\t" - "vpaddd (%%rax, %%rbx), %%zmm1, %%zmm1 \n\t" - "vpaddd 0x40(%%rax, %%rbx), %%zmm3, %%zmm3 \n\t" - "vpaddd 0x80(%%rax, %%rbx), %%zmm5, %%zmm5 \n\t" - "vpaddd 0xC0(%%rax, %%rbx), %%zmm7, %%zmm7 \n\t" - "vpaddd 0x100(%%rax, %%rbx), %%zmm9, %%zmm9 \n\t" - "vpaddd 0x140(%%rax, %%rbx), %%zmm11, %%zmm11 \n\t" - "vpaddd 0x180(%%rax, %%rbx), %%zmm13, %%zmm13 \n\t" - "vpaddd 0x1C0(%%rax, %%rbx), %%zmm15, %%zmm15 \n\t" - "vpaddd 0x200(%%rax, %%rbx), %%zmm17, %%zmm17 \n\t" - "vpaddd 0x240(%%rax, %%rbx), %%zmm19, %%zmm19 \n\t" - "vpaddd 0x280(%%rax, %%rbx), %%zmm21, %%zmm21 \n\t" - "vpaddd 0x2C0(%%rax, %%rbx), %%zmm23, %%zmm23 \n\t" + "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" + "vpaddd 0x40(%%rax), %%zmm2, %%zmm2 \n\t" + "vpaddd 0x80(%%rax), %%zmm4, %%zmm4 \n\t" + "vpaddd 0xC0(%%rax), %%zmm6, %%zmm6 \n\t" + "vpaddd 0x100(%%rax), %%zmm8, %%zmm8 \n\t" + "vpaddd 0x140(%%rax), %%zmm10, %%zmm10 \n\t" + "vpaddd 
0x180(%%rax), %%zmm12, %%zmm12 \n\t" + "vpaddd 0x1C0(%%rax), %%zmm14, %%zmm14 \n\t" + "vpaddd 0x200(%%rax), %%zmm16, %%zmm16 \n\t" + "vpaddd 0x240(%%rax), %%zmm18, %%zmm18 \n\t" + "vpaddd 0x280(%%rax), %%zmm20, %%zmm20 \n\t" + "vpaddd 0x2C0(%%rax), %%zmm22, %%zmm22 \n\t" + "vpaddd (%%rax, %%rbx), %%zmm1, %%zmm1 \n\t" + "vpaddd 0x40(%%rax, %%rbx), %%zmm3, %%zmm3 \n\t" + "vpaddd 0x80(%%rax, %%rbx), %%zmm5, %%zmm5 \n\t" + "vpaddd 0xC0(%%rax, %%rbx), %%zmm7, %%zmm7 \n\t" + "vpaddd 0x100(%%rax, %%rbx), %%zmm9, %%zmm9 \n\t" + "vpaddd 0x140(%%rax, %%rbx), %%zmm11, %%zmm11 \n\t" + "vpaddd 0x180(%%rax, %%rbx), %%zmm13, %%zmm13 \n\t" + "vpaddd 0x1C0(%%rax, %%rbx), %%zmm15, %%zmm15 \n\t" + "vpaddd 0x200(%%rax, %%rbx), %%zmm17, %%zmm17 \n\t" + "vpaddd 0x240(%%rax, %%rbx), %%zmm19, %%zmm19 \n\t" + "vpaddd 0x280(%%rax, %%rbx), %%zmm21, %%zmm21 \n\t" + "vpaddd 0x2C0(%%rax, %%rbx), %%zmm23, %%zmm23 \n\t" ".align 16 \n\t" "0: \n\t" - "movq %[flags], %%rcx \n\t" - "and $0xC, %%rcx \n\t" - "je 1f \n\t" + "cmpq $0x0, %[scale] \n\t" + "jne 1f \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" relu24Regs(%%zmm) + "jmp 4f \n\t" ".align 16 \n\t" "1: \n\t" - "cmpq $0x0, %[scale] \n\t" - "je 2f \n\t" convert24RegsI32ToF32(%[scale], %%zmm) ".align 16 \n\t" "2: \n\t" - "vmovups %%zmm0, (%%rax) \n\t" - "vmovups %%zmm2, 0x40(%%rax) \n\t" - "vmovups %%zmm4, 0x80(%%rax) \n\t" - "vmovups %%zmm6, 0xC0(%%rax) \n\t" - "vmovups %%zmm8, 0x100(%%rax) \n\t" - "vmovups %%zmm10, 0x140(%%rax) \n\t" - "vmovups %%zmm12, 0x180(%%rax) \n\t" - "vmovups %%zmm14, 0x1C0(%%rax) \n\t" - "vmovups %%zmm16, 0x200(%%rax) \n\t" - "vmovups %%zmm18, 0x240(%%rax) \n\t" - "vmovups %%zmm20, 0x280(%%rax) \n\t" - "vmovups %%zmm22, 0x2C0(%%rax) \n\t" - "vmovups %%zmm1, (%%rax, %%rbx) \n\t" - "vmovups %%zmm3, 0x40(%%rax, %%rbx) \n\t" - "vmovups %%zmm5, 0x80(%%rax, %%rbx) \n\t" - "vmovups %%zmm7, 0xC0(%%rax, %%rbx) \n\t" - "vmovups %%zmm9, 0x100(%%rax, %%rbx) \n\t" - "vmovups %%zmm11, 0x140(%%rax, %%rbx) \n\t" - "vmovups %%zmm13, 0x180(%%rax, %%rbx) \n\t" - "vmovups %%zmm15, 0x1C0(%%rax, %%rbx) \n\t" - "vmovups %%zmm17, 0x200(%%rax, %%rbx) \n\t" - "vmovups %%zmm19, 0x240(%%rax, %%rbx) \n\t" - "vmovups %%zmm21, 0x280(%%rax, %%rbx) \n\t" - "vmovups %%zmm23, 0x2C0(%%rax, %%rbx) \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x2, %%rcx \n\t" + "je 3f \n\t" + "vaddps (%[eltwise]), %%zmm0, %%zmm0 \n\t" + "vaddps 0x40(%[eltwise]), %%zmm2, %%zmm2 \n\t" + "vaddps 0x80(%[eltwise]), %%zmm4, %%zmm4 \n\t" + "vaddps 0xC0(%[eltwise]), %%zmm6, %%zmm6 \n\t" + "vaddps 0x100(%[eltwise]), %%zmm8, %%zmm8 \n\t" + "vaddps 0x140(%[eltwise]), %%zmm10, %%zmm10 \n\t" + "vaddps 0x180(%[eltwise]), %%zmm12, %%zmm12 \n\t" + "vaddps 0x1C0(%[eltwise]), %%zmm14, %%zmm14 \n\t" + "vaddps 0x200(%[eltwise]), %%zmm16, %%zmm16 \n\t" + "vaddps 0x240(%[eltwise]), %%zmm18, %%zmm18 \n\t" + "vaddps 0x280(%[eltwise]), %%zmm20, %%zmm20 \n\t" + "vaddps 0x2C0(%[eltwise]), %%zmm22, %%zmm22 \n\t" + "vaddps (%[eltwise], %%rbx), %%zmm1, %%zmm1 \n\t" + "vaddps 0x40(%[eltwise], %%rbx), %%zmm3, %%zmm3 \n\t" + "vaddps 0x80(%[eltwise], %%rbx), %%zmm5, %%zmm5 \n\t" + "vaddps 0xC0(%[eltwise], %%rbx), %%zmm7, %%zmm7 \n\t" + "vaddps 0x100(%[eltwise], %%rbx), %%zmm9, %%zmm9 \n\t" + "vaddps 0x140(%[eltwise], %%rbx), %%zmm11, %%zmm11 \n\t" + "vaddps 0x180(%[eltwise], %%rbx), %%zmm13, %%zmm13 \n\t" + "vaddps 0x1C0(%[eltwise], %%rbx), %%zmm15, %%zmm15 \n\t" + "vaddps 0x200(%[eltwise], %%rbx), %%zmm17, %%zmm17 \n\t" + "vaddps 0x240(%[eltwise], %%rbx), %%zmm19, %%zmm19 \n\t" + "vaddps 0x280(%[eltwise], 
%%rbx), %%zmm21, %%zmm21 \n\t" + "vaddps 0x2C0(%[eltwise], %%rbx), %%zmm23, %%zmm23 \n\t" + + ".align 16 \n\t" + "3: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" + relu24RegsPs(%%zmm) + + ".align 16 \n\t" + "4: \n\t" + "vmovups %%zmm0, (%%rax) \n\t" + "vmovups %%zmm2, 0x40(%%rax) \n\t" + "vmovups %%zmm4, 0x80(%%rax) \n\t" + "vmovups %%zmm6, 0xC0(%%rax) \n\t" + "vmovups %%zmm8, 0x100(%%rax) \n\t" + "vmovups %%zmm10, 0x140(%%rax) \n\t" + "vmovups %%zmm12, 0x180(%%rax) \n\t" + "vmovups %%zmm14, 0x1C0(%%rax) \n\t" + "vmovups %%zmm16, 0x200(%%rax) \n\t" + "vmovups %%zmm18, 0x240(%%rax) \n\t" + "vmovups %%zmm20, 0x280(%%rax) \n\t" + "vmovups %%zmm22, 0x2C0(%%rax) \n\t" + "vmovups %%zmm1, (%%rax, %%rbx) \n\t" + "vmovups %%zmm3, 0x40(%%rax, %%rbx) \n\t" + "vmovups %%zmm5, 0x80(%%rax, %%rbx) \n\t" + "vmovups %%zmm7, 0xC0(%%rax, %%rbx) \n\t" + "vmovups %%zmm9, 0x100(%%rax, %%rbx) \n\t" + "vmovups %%zmm11, 0x140(%%rax, %%rbx) \n\t" + "vmovups %%zmm13, 0x180(%%rax, %%rbx) \n\t" + "vmovups %%zmm15, 0x1C0(%%rax, %%rbx) \n\t" + "vmovups %%zmm17, 0x200(%%rax, %%rbx) \n\t" + "vmovups %%zmm19, 0x240(%%rax, %%rbx) \n\t" + "vmovups %%zmm21, 0x280(%%rax, %%rbx) \n\t" + "vmovups %%zmm23, 0x2C0(%%rax, %%rbx) \n\t" : - : [output] "r" (c.output), [ostepC16] "r" (c.ostepC16), [flags] "r" (c.flags), [scale] "r" (c.scale) - : "%rax", "%rbx", "%rcx", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", - "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", - "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", - "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", - "%zmm31", "memory", "cc"); + : [output] "r" (c.output), + [eltwise] "r" (c.eltwise), + [ostepC16] "r" (c.ostepC16), + [flags] "r" (c.flags), + [scale] "r" (c.scale) + : "%rax", "%rbx", "%rcx", + "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", + "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", + "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", + "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", + "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", + "%zmm30", "%zmm31", "memory", "cc"); } void Avx512ConvKernel6x32(ConvController &c) { - convKernelForLoopXx32(12, 6) + if (c.cross) { + convKernelForLoopXx32(12, 6, 1) + } else { + convKernelForLoopXx32(12, 6, 0) + } - __asm__ __volatile__("movq %[output], %%rax \n\t" - "movq %[ostepC16], %%rbx \n\t" - "movq %[flags], %%rcx \n\t" - "and $0x1, %%rcx \n\t" + __asm__ __volatile__("movq %[output], %%rax \n\t" + "movq %[ostepC16], %%rbx \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x1, %%rcx \n\t" "je 0f \n\t" - "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" - "vpaddd 0x40(%%rax), %%zmm2, %%zmm2 \n\t" - "vpaddd 0x80(%%rax), %%zmm4, %%zmm4 \n\t" - "vpaddd 0xC0(%%rax), %%zmm6, %%zmm6 \n\t" - "vpaddd 0x100(%%rax), %%zmm8, %%zmm8 \n\t" - "vpaddd 0x140(%%rax), %%zmm10, %%zmm10 \n\t" - "vpaddd (%%rax, %%rbx), %%zmm1, %%zmm1 \n\t" - "vpaddd 0x40(%%rax, %%rbx), %%zmm3, %%zmm3 \n\t" - "vpaddd 0x80(%%rax, %%rbx), %%zmm5, %%zmm5 \n\t" - "vpaddd 0xC0(%%rax, %%rbx), %%zmm7, %%zmm7 \n\t" - "vpaddd 0x100(%%rax, %%rbx), %%zmm9, %%zmm9 \n\t" - "vpaddd 0x140(%%rax, %%rbx), %%zmm11, %%zmm11 \n\t" + "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" + "vpaddd 0x40(%%rax), %%zmm2, %%zmm2 \n\t" + "vpaddd 0x80(%%rax), %%zmm4, %%zmm4 \n\t" + "vpaddd 0xC0(%%rax), %%zmm6, %%zmm6 \n\t" + "vpaddd 0x100(%%rax), %%zmm8, %%zmm8 \n\t" + "vpaddd 0x140(%%rax), %%zmm10, %%zmm10 \n\t" + "vpaddd (%%rax, %%rbx), 
%%zmm1, %%zmm1 \n\t" + "vpaddd 0x40(%%rax, %%rbx), %%zmm3, %%zmm3 \n\t" + "vpaddd 0x80(%%rax, %%rbx), %%zmm5, %%zmm5 \n\t" + "vpaddd 0xC0(%%rax, %%rbx), %%zmm7, %%zmm7 \n\t" + "vpaddd 0x100(%%rax, %%rbx), %%zmm9, %%zmm9 \n\t" + "vpaddd 0x140(%%rax, %%rbx), %%zmm11, %%zmm11 \n\t" ".align 16 \n\t" "0: \n\t" - "movq %[flags], %%rcx \n\t" - "and $0xC, %%rcx \n\t" - "je 1f \n\t" + "cmpq $0x0, %[scale] \n\t" + "jne 1f \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" relu12Regs(%%zmm) + "jmp 4f \n\t" ".align 16 \n\t" "1: \n\t" - "cmpq $0x0, %[scale] \n\t" - "je 2f \n\t" convert12RegsI32ToF32(%[scale], %%zmm) ".align 16 \n\t" "2: \n\t" - "vmovups %%zmm0, (%%rax) \n\t" - "vmovups %%zmm2, 0x40(%%rax) \n\t" - "vmovups %%zmm4, 0x80(%%rax) \n\t" - "vmovups %%zmm6, 0xC0(%%rax) \n\t" - "vmovups %%zmm8, 0x100(%%rax) \n\t" - "vmovups %%zmm10, 0x140(%%rax) \n\t" - "vmovups %%zmm1, (%%rax, %%rbx) \n\t" - "vmovups %%zmm3, 0x40(%%rax, %%rbx) \n\t" - "vmovups %%zmm5, 0x80(%%rax, %%rbx) \n\t" - "vmovups %%zmm7, 0xC0(%%rax, %%rbx) \n\t" - "vmovups %%zmm9, 0x100(%%rax, %%rbx) \n\t" - "vmovups %%zmm11, 0x140(%%rax, %%rbx) \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x2, %%rcx \n\t" + "je 3f \n\t" + "vaddps (%[eltwise]), %%zmm0, %%zmm0 \n\t" + "vaddps 0x40(%[eltwise]), %%zmm2, %%zmm2 \n\t" + "vaddps 0x80(%[eltwise]), %%zmm4, %%zmm4 \n\t" + "vaddps 0xC0(%[eltwise]), %%zmm6, %%zmm6 \n\t" + "vaddps 0x100(%[eltwise]), %%zmm8, %%zmm8 \n\t" + "vaddps 0x140(%[eltwise]), %%zmm10, %%zmm10 \n\t" + "vaddps (%[eltwise], %%rbx), %%zmm1, %%zmm1 \n\t" + "vaddps 0x40(%[eltwise], %%rbx), %%zmm3, %%zmm3 \n\t" + "vaddps 0x80(%[eltwise], %%rbx), %%zmm5, %%zmm5 \n\t" + "vaddps 0xC0(%[eltwise], %%rbx), %%zmm7, %%zmm7 \n\t" + "vaddps 0x100(%[eltwise], %%rbx), %%zmm9, %%zmm9 \n\t" + "vaddps 0x140(%[eltwise], %%rbx), %%zmm11, %%zmm11 \n\t" + + ".align 16 \n\t" + "3: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" + relu12RegsPs(%%zmm) + + ".align 16 \n\t" + "4: \n\t" + "vmovups %%zmm0, (%%rax) \n\t" + "vmovups %%zmm2, 0x40(%%rax) \n\t" + "vmovups %%zmm4, 0x80(%%rax) \n\t" + "vmovups %%zmm6, 0xC0(%%rax) \n\t" + "vmovups %%zmm8, 0x100(%%rax) \n\t" + "vmovups %%zmm10, 0x140(%%rax) \n\t" + "vmovups %%zmm1, (%%rax, %%rbx) \n\t" + "vmovups %%zmm3, 0x40(%%rax, %%rbx) \n\t" + "vmovups %%zmm5, 0x80(%%rax, %%rbx) \n\t" + "vmovups %%zmm7, 0xC0(%%rax, %%rbx) \n\t" + "vmovups %%zmm9, 0x100(%%rax, %%rbx) \n\t" + "vmovups %%zmm11, 0x140(%%rax, %%rbx) \n\t" : - : [output] "r" (c.output), [ostepC16] "r" (c.ostepC16), [flags] "r" (c.flags), [scale] "r" (c.scale) - : "%rax", "%rbx", "%rcx", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", - "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", - "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", - "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", - "%zmm31", "memory", "cc"); + : [output] "r" (c.output), + [ostepC16] "r" (c.ostepC16), + [eltwise] "r" (c.eltwise), + [flags] "r" (c.flags), + [scale] "r" (c.scale) + : "%rax", "%rbx", "%rcx", + "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", + "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", + "%zmm24", "%zmm31", "memory", "cc"); } void Avx512ConvKernel1x32(ConvController &c) { - convKernelForLoopXx32(2, 1) - - __asm__ __volatile__("movq %[output], %%rax \n\t" - "movq %[ostepC16], %%rbx \n\t" - "movq %[flags], %%rcx \n\t" - "and $0x1, %%rcx \n\t" - "je 0f \n\t" - "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" - 
"vpaddd (%%rax, %%rbx), %%zmm1, %%zmm1 \n\t" - - ".align 16 \n\t" - "0: \n\t" - "movq %[flags], %%rcx \n\t" - "and $0xC, %%rcx \n\t" - "je 1f \n\t" + convKernelForLoopXx32(24, 12, 0) + + __asm__ __volatile__("movq %[output], %%rax \n\t" + "movq %[ostepC16], %%rbx \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x1, %%rcx \n\t" + "je 0f \n\t" + "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" + "vpaddd (%%rax, %%rbx), %%zmm1, %%zmm1 \n\t" + + ".align 16 \n\t" + "0: \n\t" + "cmpq $0x0, %[scale] \n\t" + "jne 1f \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" relu2Regs(%%zmm) + "jmp 4f \n\t" - ".align 16 \n\t" - "1: \n\t" - "cmpq $0x0, %[scale] \n\t" - "je 2f \n\t" + ".align 16 \n\t" + "1: \n\t" convert2RegsI32ToF32(%[scale], %%zmm) - ".align 16 \n\t" - "2: \n\t" - "vmovups %%zmm0, (%%rax) \n\t" - "vmovups %%zmm1, (%%rax, %%rbx) \n\t" + ".align 16 \n\t" + "2: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x2, %%rcx \n\t" + "je 3f \n\t" + "vaddps (%[eltwise]), %%zmm0, %%zmm0 \n\t" + "vaddps (%[eltwise], %%rbx), %%zmm1, %%zmm1 \n\t" + + ".align 16 \n\t" + "3: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" + relu2RegsPs(%%zmm) + + ".align 16 \n\t" + "4: \n\t" + "vmovups %%zmm0, (%%rax) \n\t" + "vmovups %%zmm1, (%%rax, %%rbx) \n\t" : - : [output] "r" (c.output), [ostepC16] "r" (c.ostepC16), [flags] "r" (c.flags), [scale] "r" (c.scale) - : "%rax", "%rbx", "%rcx", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", - "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", - "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", - "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", - "%zmm31", "memory", "cc"); + : [output] "r" (c.output), + [eltwise] "r" (c.eltwise), + [ostepC16] "r" (c.ostepC16), + [flags] "r" (c.flags), + [scale] "r" (c.scale) + : "%rax", "%rbx", "%rcx", + "%zmm0", "%zmm1", "%zmm24", "%zmm31", + "memory", "cc"); } -#define load16BiasTo1Regs(bias, rtype) \ - "vmovups ("#bias"), "#rtype"0 \n\t" - -#define load16BiasTo12Regs(bias, rtype) \ - load16BiasTo1Regs(bias, rtype) \ - "vmovups "#rtype"0, "#rtype"1 \n\t" \ - "vmovups "#rtype"0, "#rtype"2 \n\t" \ - "vmovups "#rtype"0, "#rtype"3 \n\t" \ - "vmovups "#rtype"0, "#rtype"4 \n\t" \ - "vmovups "#rtype"0, "#rtype"5 \n\t" \ - "vmovups "#rtype"0, "#rtype"6 \n\t" \ - "vmovups "#rtype"0, "#rtype"7 \n\t" \ - "vmovups "#rtype"0, "#rtype"8 \n\t" \ - "vmovups "#rtype"0, "#rtype"9 \n\t" \ - "vmovups "#rtype"0, "#rtype"10 \n\t" \ - "vmovups "#rtype"0, "#rtype"11 \n\t" - -#define load16BiasTo24Regs(bias, rtype) \ - load16BiasTo12Regs(bias, rtype) \ - "vmovups "#rtype"0, "#rtype"12 \n\t" \ - "vmovups "#rtype"0, "#rtype"13 \n\t" \ - "vmovups "#rtype"0, "#rtype"14 \n\t" \ - "vmovups "#rtype"0, "#rtype"15 \n\t" \ - "vmovups "#rtype"0, "#rtype"16 \n\t" \ - "vmovups "#rtype"0, "#rtype"17 \n\t" \ - "vmovups "#rtype"0, "#rtype"18 \n\t" \ - "vmovups "#rtype"0, "#rtype"19 \n\t" \ - "vmovups "#rtype"0, "#rtype"20 \n\t" \ - "vmovups "#rtype"0, "#rtype"21 \n\t" \ - "vmovups "#rtype"0, "#rtype"22 \n\t" \ - "vmovups "#rtype"0, "#rtype"23 \n\t" - #ifdef _USE_AVX512_VNNI -#define convKernel24x16c4(input, freg0, off0, preg0, rtype) \ - "vpbroadcastd ("#input"), "#rtype"26 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), "#rtype"27 \n\t" \ - "vpbroadcastd ("#input", %[stepC16], 2), "#rtype"28 \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpbroadcastd ("#input"), 
"#rtype"29 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), "#rtype"30 \n\t" \ - "vpbroadcastd ("#input", %[stepC16], 2), "#rtype"31 \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ - "vpdpbusd "#freg0", "#rtype"26, "#rtype"0 \n\t" \ - "vpdpbusd "#freg0", "#rtype"27, "#rtype"1 \n\t" \ - "vpdpbusd "#freg0", "#rtype"28, "#rtype"2 \n\t" \ - "vpdpbusd "#freg0", "#rtype"29, "#rtype"3 \n\t" \ - "vpdpbusd "#freg0", "#rtype"30, "#rtype"4 \n\t" \ - "vpdpbusd "#freg0", "#rtype"31, "#rtype"5 \n\t" \ - "vpbroadcastd ("#input"), "#rtype"26 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), "#rtype"27 \n\t" \ - "vpbroadcastd ("#input", %[stepC16], 2), "#rtype"28 \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpbroadcastd ("#input"), "#rtype"29 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), "#rtype"30 \n\t" \ - "vpbroadcastd ("#input", %[stepC16], 2), "#rtype"31 \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpdpbusd "#freg0", "#rtype"26, "#rtype"6 \n\t" \ - "vpdpbusd "#freg0", "#rtype"27, "#rtype"7 \n\t" \ - "vpdpbusd "#freg0", "#rtype"28, "#rtype"8 \n\t" \ - "vpdpbusd "#freg0", "#rtype"29, "#rtype"9 \n\t" \ - "vpdpbusd "#freg0", "#rtype"30, "#rtype"10 \n\t" \ - "vpdpbusd "#freg0", "#rtype"31, "#rtype"11 \n\t" \ - "vpbroadcastd ("#input"), "#rtype"26 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), "#rtype"27 \n\t" \ - "vpbroadcastd ("#input", %[stepC16], 2), "#rtype"28 \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpbroadcastd ("#input"), "#rtype"29 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), "#rtype"30 \n\t" \ - "vpbroadcastd ("#input", %[stepC16], 2), "#rtype"31 \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpdpbusd "#freg0", "#rtype"26, "#rtype"12 \n\t" \ - "vpdpbusd "#freg0", "#rtype"27, "#rtype"13 \n\t" \ - "vpdpbusd "#freg0", "#rtype"28, "#rtype"14 \n\t" \ - "vpdpbusd "#freg0", "#rtype"29, "#rtype"15 \n\t" \ - "vpdpbusd "#freg0", "#rtype"30, "#rtype"16 \n\t" \ - "vpdpbusd "#freg0", "#rtype"31, "#rtype"17 \n\t" \ - "vpbroadcastd ("#input"), "#rtype"26 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), "#rtype"27 \n\t" \ - "vpbroadcastd ("#input", %[stepC16], 2), "#rtype"28 \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpbroadcastd ("#input"), "#rtype"29 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), "#rtype"30 \n\t" \ - "vpbroadcastd ("#input", %[stepC16], 2), "#rtype"31 \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpdpbusd "#freg0", "#rtype"26, "#rtype"18 \n\t" \ - "vpdpbusd "#freg0", "#rtype"27, "#rtype"19 \n\t" \ - "vpdpbusd "#freg0", "#rtype"28, "#rtype"20 \n\t" \ - "vpdpbusd "#freg0", "#rtype"29, "#rtype"21 \n\t" \ - "vpdpbusd "#freg0", "#rtype"30, "#rtype"22 \n\t" \ - "vpdpbusd "#freg0", "#rtype"31, "#rtype"23 \n\t" - -#define convKernel12x16c4(input, freg0, off0, preg0, rtype) \ - "vpbroadcastd ("#input"), "#rtype"26 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), "#rtype"27 \n\t" \ - "vpbroadcastd ("#input", %[stepC16], 2), "#rtype"28 \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq 
%[stepC16], "#input" \n\t" \ - "vpbroadcastd ("#input"), "#rtype"29 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), "#rtype"30 \n\t" \ - "vpbroadcastd ("#input", %[stepC16], 2), "#rtype"31 \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ - "vpdpbusd "#freg0", "#rtype"26, "#rtype"0 \n\t" \ - "vpdpbusd "#freg0", "#rtype"27, "#rtype"1 \n\t" \ - "vpdpbusd "#freg0", "#rtype"28, "#rtype"2 \n\t" \ - "vpdpbusd "#freg0", "#rtype"29, "#rtype"3 \n\t" \ - "vpdpbusd "#freg0", "#rtype"30, "#rtype"4 \n\t" \ - "vpdpbusd "#freg0", "#rtype"31, "#rtype"5 \n\t" \ - "vpbroadcastd ("#input"), "#rtype"26 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), "#rtype"27 \n\t" \ - "vpbroadcastd ("#input", %[stepC16], 2), "#rtype"28 \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpbroadcastd ("#input"), "#rtype"29 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), "#rtype"30 \n\t" \ - "vpbroadcastd ("#input", %[stepC16], 2), "#rtype"31 \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpdpbusd "#freg0", "#rtype"26, "#rtype"6 \n\t" \ - "vpdpbusd "#freg0", "#rtype"27, "#rtype"7 \n\t" \ - "vpdpbusd "#freg0", "#rtype"28, "#rtype"8 \n\t" \ - "vpdpbusd "#freg0", "#rtype"29, "#rtype"9 \n\t" \ - "vpdpbusd "#freg0", "#rtype"30, "#rtype"10 \n\t" \ - "vpdpbusd "#freg0", "#rtype"31, "#rtype"11 \n\t" - -#define convKernel1x16c4(input, freg0, off0, preg0, rtype) \ - "vpbroadcastd ("#input"), "#rtype"26 \n\t" \ - "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ - "vpdpbusd "#freg0", "#rtype"26, "#rtype"0 \n\t" +#define convKernel24x16c4_1(input, freg0, off0, preg0, rtype) \ + "movq (%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"27 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x8(%[stepC16]), "#input" \n\t" \ + "movq 0x10(%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"28 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"29 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x18(%[stepC16]), "#input" \n\t" \ + "movq 0x20(%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"30 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"31 \n\t" \ + "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ + "vpdpbusd "#freg0", "#rtype"26, "#rtype"0 \n\t" \ + "vpdpbusd "#freg0", "#rtype"27, "#rtype"1 \n\t" \ + "vpdpbusd "#freg0", "#rtype"28, "#rtype"2 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x28(%[stepC16]), "#input" \n\t" \ + "movq 0x30(%[stepC16]), %%r10 \n\t" \ + "vpdpbusd "#freg0", "#rtype"29, "#rtype"3 \n\t" \ + "vpdpbusd "#freg0", "#rtype"30, "#rtype"4 \n\t" \ + "vpdpbusd "#freg0", "#rtype"31, "#rtype"5 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"27 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x38(%[stepC16]), "#input" \n\t" \ + "movq 0x40(%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"28 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"29 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x48(%[stepC16]), "#input" \n\t" \ + "movq 0x50(%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"30 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"31 \n\t" \ + "vpdpbusd "#freg0", "#rtype"26, "#rtype"6 \n\t" \ + "vpdpbusd "#freg0", "#rtype"27, "#rtype"7 \n\t" \ + "vpdpbusd "#freg0", "#rtype"28, "#rtype"8 \n\t" \ + "addq 
%%r10, "#input" \n\t" \ + "addq 0x58(%[stepC16]), "#input" \n\t" \ + "movq 0x60(%[stepC16]), %%r10 \n\t" \ + "vpdpbusd "#freg0", "#rtype"29, "#rtype"9 \n\t" \ + "vpdpbusd "#freg0", "#rtype"30, "#rtype"10 \n\t" \ + "vpdpbusd "#freg0", "#rtype"31, "#rtype"11 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"27 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x68(%[stepC16]), "#input" \n\t" \ + "movq 0x70(%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"28 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"29 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x78(%[stepC16]), "#input" \n\t" \ + "movq 0x80(%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"30 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"31 \n\t" \ + "vpdpbusd "#freg0", "#rtype"26, "#rtype"12 \n\t" \ + "vpdpbusd "#freg0", "#rtype"27, "#rtype"13 \n\t" \ + "vpdpbusd "#freg0", "#rtype"28, "#rtype"14 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x88(%[stepC16]), "#input" \n\t" \ + "movq 0x90(%[stepC16]), %%r10 \n\t" \ + "vpdpbusd "#freg0", "#rtype"29, "#rtype"15 \n\t" \ + "vpdpbusd "#freg0", "#rtype"30, "#rtype"16 \n\t" \ + "vpdpbusd "#freg0", "#rtype"31, "#rtype"17 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"27 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x98(%[stepC16]), "#input" \n\t" \ + "movq 0xA0(%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"28 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"29 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0xA8(%[stepC16]), "#input" \n\t" \ + "movq 0xB0(%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"30 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"31 \n\t" \ + "vpdpbusd "#freg0", "#rtype"26, "#rtype"18 \n\t" \ + "vpdpbusd "#freg0", "#rtype"27, "#rtype"19 \n\t" \ + "vpdpbusd "#freg0", "#rtype"28, "#rtype"20 \n\t" \ + "vpdpbusd "#freg0", "#rtype"29, "#rtype"21 \n\t" \ + "vpdpbusd "#freg0", "#rtype"30, "#rtype"22 \n\t" \ + "vpdpbusd "#freg0", "#rtype"31, "#rtype"23 \n\t" + +#define convKernel12x16c4_1(input, freg0, off0, preg0, rtype) \ + "movq (%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"27 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x8(%[stepC16]), "#input" \n\t" \ + "movq 0x10(%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"28 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"29 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x18(%[stepC16]), "#input" \n\t" \ + "movq 0x20(%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"30 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"31 \n\t" \ + "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ + "vpdpbusd "#freg0", "#rtype"26, "#rtype"0 \n\t" \ + "vpdpbusd "#freg0", "#rtype"27, "#rtype"1 \n\t" \ + "vpdpbusd "#freg0", "#rtype"28, "#rtype"2 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x28(%[stepC16]), "#input" \n\t" \ + "movq 0x30(%[stepC16]), %%r10 \n\t" \ + "vpdpbusd "#freg0", "#rtype"29, "#rtype"3 \n\t" \ + "vpdpbusd "#freg0", "#rtype"30, "#rtype"4 \n\t" \ + "vpdpbusd "#freg0", "#rtype"31, "#rtype"5 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"27 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x38(%[stepC16]), "#input" \n\t" \ + "movq 0x40(%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"28 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"29 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 
0x48(%[stepC16]), "#input" \n\t" \ + "movq 0x50(%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"30 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"31 \n\t" \ + "vpdpbusd "#freg0", "#rtype"26, "#rtype"6 \n\t" \ + "vpdpbusd "#freg0", "#rtype"27, "#rtype"7 \n\t" \ + "vpdpbusd "#freg0", "#rtype"28, "#rtype"8 \n\t" \ + "vpdpbusd "#freg0", "#rtype"29, "#rtype"9 \n\t" \ + "vpdpbusd "#freg0", "#rtype"30, "#rtype"10 \n\t" \ + "vpdpbusd "#freg0", "#rtype"31, "#rtype"11 \n\t" + +#define convKernel1x16c4_1(input, freg0, off0, preg0, rtype) \ + "vpbroadcastd ("#input"), "#rtype"26 \n\t" \ + "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ + "vpdpbusd "#freg0", "#rtype"26, "#rtype"0 \n\t" + +#define convKernel24x16c4_0(input, freg0, off0, preg0, rtype) \ + "vpbroadcastd ("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"27 \n\t" \ + "vpbroadcastd ("#input", %%r10, 2), "#rtype"28 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpbroadcastd ("#input"), "#rtype"29 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"30 \n\t" \ + "vpbroadcastd ("#input", %%r10, 2), "#rtype"31 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ + "vpdpbusd "#freg0", "#rtype"26, "#rtype"0 \n\t" \ + "vpdpbusd "#freg0", "#rtype"27, "#rtype"1 \n\t" \ + "vpdpbusd "#freg0", "#rtype"28, "#rtype"2 \n\t" \ + "vpdpbusd "#freg0", "#rtype"29, "#rtype"3 \n\t" \ + "vpdpbusd "#freg0", "#rtype"30, "#rtype"4 \n\t" \ + "vpdpbusd "#freg0", "#rtype"31, "#rtype"5 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"27 \n\t" \ + "vpbroadcastd ("#input", %%r10, 2), "#rtype"28 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpbroadcastd ("#input"), "#rtype"29 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"30 \n\t" \ + "vpbroadcastd ("#input", %%r10, 2), "#rtype"31 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpdpbusd "#freg0", "#rtype"26, "#rtype"6 \n\t" \ + "vpdpbusd "#freg0", "#rtype"27, "#rtype"7 \n\t" \ + "vpdpbusd "#freg0", "#rtype"28, "#rtype"8 \n\t" \ + "vpdpbusd "#freg0", "#rtype"29, "#rtype"9 \n\t" \ + "vpdpbusd "#freg0", "#rtype"30, "#rtype"10 \n\t" \ + "vpdpbusd "#freg0", "#rtype"31, "#rtype"11 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"27 \n\t" \ + "vpbroadcastd ("#input", %%r10, 2), "#rtype"28 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpbroadcastd ("#input"), "#rtype"29 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"30 \n\t" \ + "vpbroadcastd ("#input", %%r10, 2), "#rtype"31 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpdpbusd "#freg0", "#rtype"26, "#rtype"12 \n\t" \ + "vpdpbusd "#freg0", "#rtype"27, "#rtype"13 \n\t" \ + "vpdpbusd "#freg0", "#rtype"28, "#rtype"14 \n\t" \ + "vpdpbusd "#freg0", "#rtype"29, "#rtype"15 \n\t" \ + "vpdpbusd "#freg0", "#rtype"30, "#rtype"16 \n\t" \ + "vpdpbusd "#freg0", "#rtype"31, "#rtype"17 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"27 \n\t" \ + "vpbroadcastd ("#input", %%r10, 2), "#rtype"28 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpbroadcastd 
("#input"), "#rtype"29 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"30 \n\t" \ + "vpbroadcastd ("#input", %%r10, 2), "#rtype"31 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpdpbusd "#freg0", "#rtype"26, "#rtype"18 \n\t" \ + "vpdpbusd "#freg0", "#rtype"27, "#rtype"19 \n\t" \ + "vpdpbusd "#freg0", "#rtype"28, "#rtype"20 \n\t" \ + "vpdpbusd "#freg0", "#rtype"29, "#rtype"21 \n\t" \ + "vpdpbusd "#freg0", "#rtype"30, "#rtype"22 \n\t" \ + "vpdpbusd "#freg0", "#rtype"31, "#rtype"23 \n\t" + +#define convKernel12x16c4_0(input, freg0, off0, preg0, rtype) \ + "vpbroadcastd ("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"27 \n\t" \ + "vpbroadcastd ("#input", %%r10, 2), "#rtype"28 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpbroadcastd ("#input"), "#rtype"29 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"30 \n\t" \ + "vpbroadcastd ("#input", %%r10, 2), "#rtype"31 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ + "vpdpbusd "#freg0", "#rtype"26, "#rtype"0 \n\t" \ + "vpdpbusd "#freg0", "#rtype"27, "#rtype"1 \n\t" \ + "vpdpbusd "#freg0", "#rtype"28, "#rtype"2 \n\t" \ + "vpdpbusd "#freg0", "#rtype"29, "#rtype"3 \n\t" \ + "vpdpbusd "#freg0", "#rtype"30, "#rtype"4 \n\t" \ + "vpdpbusd "#freg0", "#rtype"31, "#rtype"5 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"27 \n\t" \ + "vpbroadcastd ("#input", %%r10, 2), "#rtype"28 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpbroadcastd ("#input"), "#rtype"29 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"30 \n\t" \ + "vpbroadcastd ("#input", %%r10, 2), "#rtype"31 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpdpbusd "#freg0", "#rtype"26, "#rtype"6 \n\t" \ + "vpdpbusd "#freg0", "#rtype"27, "#rtype"7 \n\t" \ + "vpdpbusd "#freg0", "#rtype"28, "#rtype"8 \n\t" \ + "vpdpbusd "#freg0", "#rtype"29, "#rtype"9 \n\t" \ + "vpdpbusd "#freg0", "#rtype"30, "#rtype"10 \n\t" \ + "vpdpbusd "#freg0", "#rtype"31, "#rtype"11 \n\t" + +#define convKernel1x16c4_0(input, freg0, off0, preg0, rtype) \ + "vpbroadcastd ("#input"), "#rtype"26 \n\t" \ + "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ + "vpdpbusd "#freg0", "#rtype"26, "#rtype"0 \n\t" + #else #define convKernel24x16c4_3(input, freg0, off0, preg0, rtype) \ - "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), "#rtype"26 \n\t" \ - "vpbroadcastd ("#input", %[stepC16], 2), "#rtype"27 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ - "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ - "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ - "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), "#rtype"26 \n\t" \ - "vpbroadcastd ("#input", %[stepC16], 2), "#rtype"27 \n\t" \ - "vpaddd "#rtype"0, "#rtype"28, "#rtype"0 \n\t" \ - "vpaddd "#rtype"1, "#rtype"29, "#rtype"1 \n\t" \ - "vpaddd "#rtype"2, "#rtype"30, "#rtype"2 \n\t" \ - "vpmaddubsw "#freg0", 
"#rtype"25, "#rtype"28 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ - "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ - "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ - "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), "#rtype"26 \n\t" \ - "vpbroadcastd ("#input", %[stepC16], 2), "#rtype"27 \n\t" \ - "vpaddd "#rtype"3, "#rtype"28, "#rtype"3 \n\t" \ - "vpaddd "#rtype"4, "#rtype"29, "#rtype"4 \n\t" \ - "vpaddd "#rtype"5, "#rtype"30, "#rtype"5 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ - "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ - "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ - "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), "#rtype"26 \n\t" \ - "vpbroadcastd ("#input", %[stepC16], 2), "#rtype"27 \n\t" \ - "vpaddd "#rtype"6, "#rtype"28, "#rtype"6 \n\t" \ - "vpaddd "#rtype"7, "#rtype"29, "#rtype"7 \n\t" \ - "vpaddd "#rtype"8, "#rtype"30, "#rtype"8 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ - "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ - "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ - "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), "#rtype"26 \n\t" \ - "vpbroadcastd ("#input", %[stepC16], 2), "#rtype"27 \n\t" \ - "vpaddd "#rtype"9, "#rtype"28, "#rtype"9 \n\t" \ - "vpaddd "#rtype"10, "#rtype"29, "#rtype"10 \n\t" \ - "vpaddd "#rtype"11, "#rtype"30, "#rtype"11 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ - "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ - "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ - "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), "#rtype"26 \n\t" \ - "vpbroadcastd ("#input", %[stepC16], 2), "#rtype"27 \n\t" \ - "vpaddd "#rtype"12, "#rtype"28, "#rtype"12 \n\t" \ - "vpaddd "#rtype"13, "#rtype"29, "#rtype"13 \n\t" \ - "vpaddd "#rtype"14, "#rtype"30, "#rtype"14 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ - "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ - "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ - "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ - "vpbroadcastd ("#input", 
%[stepC16]), "#rtype"26 \n\t" \ - "vpbroadcastd ("#input", %[stepC16], 2), "#rtype"27 \n\t" \ - "vpaddd "#rtype"15, "#rtype"28, "#rtype"15 \n\t" \ - "vpaddd "#rtype"16, "#rtype"29, "#rtype"16 \n\t" \ - "vpaddd "#rtype"17, "#rtype"30, "#rtype"17 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ - "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ - "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ - "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), "#rtype"26 \n\t" \ - "vpbroadcastd ("#input", %[stepC16], 2), "#rtype"27 \n\t" \ - "vpaddd "#rtype"18, "#rtype"28, "#rtype"18 \n\t" \ - "vpaddd "#rtype"19, "#rtype"29, "#rtype"19 \n\t" \ - "vpaddd "#rtype"20, "#rtype"30, "#rtype"20 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ - "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ - "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ - "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ - "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ - "vpaddd "#rtype"21, "#rtype"28, "#rtype"21 \n\t" \ - "vpaddd "#rtype"22, "#rtype"29, "#rtype"22 \n\t" \ - "vpaddd "#rtype"23, "#rtype"30, "#rtype"23 \n\t" + "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"26 \n\t" \ + "vpbroadcastd ("#input", %%r10, 2), "#rtype"27 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"26 \n\t" \ + "vpbroadcastd ("#input", %%r10, 2), "#rtype"27 \n\t" \ + "vpaddd "#rtype"0, "#rtype"28, "#rtype"0 \n\t" \ + "vpaddd "#rtype"1, "#rtype"29, "#rtype"1 \n\t" \ + "vpaddd "#rtype"2, "#rtype"30, "#rtype"2 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"26 \n\t" \ + "vpbroadcastd ("#input", %%r10, 2), "#rtype"27 \n\t" \ + "vpaddd "#rtype"3, "#rtype"28, "#rtype"3 \n\t" \ + "vpaddd "#rtype"4, "#rtype"29, "#rtype"4 \n\t" \ + "vpaddd "#rtype"5, "#rtype"30, "#rtype"5 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 
\n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"26 \n\t" \ + "vpbroadcastd ("#input", %%r10, 2), "#rtype"27 \n\t" \ + "vpaddd "#rtype"6, "#rtype"28, "#rtype"6 \n\t" \ + "vpaddd "#rtype"7, "#rtype"29, "#rtype"7 \n\t" \ + "vpaddd "#rtype"8, "#rtype"30, "#rtype"8 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"26 \n\t" \ + "vpbroadcastd ("#input", %%r10, 2), "#rtype"27 \n\t" \ + "vpaddd "#rtype"9, "#rtype"28, "#rtype"9 \n\t" \ + "vpaddd "#rtype"10, "#rtype"29, "#rtype"10 \n\t" \ + "vpaddd "#rtype"11, "#rtype"30, "#rtype"11 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"26 \n\t" \ + "vpbroadcastd ("#input", %%r10, 2), "#rtype"27 \n\t" \ + "vpaddd "#rtype"12, "#rtype"28, "#rtype"12 \n\t" \ + "vpaddd "#rtype"13, "#rtype"29, "#rtype"13 \n\t" \ + "vpaddd "#rtype"14, "#rtype"30, "#rtype"14 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"26 \n\t" \ + "vpbroadcastd ("#input", %%r10, 2), "#rtype"27 \n\t" \ + "vpaddd "#rtype"15, "#rtype"28, "#rtype"15 \n\t" \ + "vpaddd "#rtype"16, "#rtype"29, "#rtype"16 \n\t" \ + "vpaddd "#rtype"17, "#rtype"30, "#rtype"17 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"26 \n\t" \ + "vpbroadcastd ("#input", %%r10, 2), "#rtype"27 \n\t" \ + "vpaddd "#rtype"18, "#rtype"28, "#rtype"18 \n\t" \ + "vpaddd "#rtype"19, "#rtype"29, "#rtype"19 \n\t" \ + "vpaddd "#rtype"20, "#rtype"30, "#rtype"20 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, 
"#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ + "vpaddd "#rtype"21, "#rtype"28, "#rtype"21 \n\t" \ + "vpaddd "#rtype"22, "#rtype"29, "#rtype"22 \n\t" \ + "vpaddd "#rtype"23, "#rtype"30, "#rtype"23 \n\t" #define convKernel12x16c4_3(input, freg0, off0, preg0, rtype) \ - "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), "#rtype"26 \n\t" \ - "vpbroadcastd ("#input", %[stepC16], 2), "#rtype"27 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ - "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ - "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ - "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), "#rtype"26 \n\t" \ - "vpbroadcastd ("#input", %[stepC16], 2), "#rtype"27 \n\t" \ - "vpaddd "#rtype"0, "#rtype"28, "#rtype"0 \n\t" \ - "vpaddd "#rtype"1, "#rtype"29, "#rtype"1 \n\t" \ - "vpaddd "#rtype"2, "#rtype"30, "#rtype"2 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ - "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ - "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ - "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), "#rtype"26 \n\t" \ - "vpbroadcastd ("#input", %[stepC16], 2), "#rtype"27 \n\t" \ - "vpaddd "#rtype"3, "#rtype"28, "#rtype"3 \n\t" \ - "vpaddd "#rtype"4, "#rtype"29, "#rtype"4 \n\t" \ - "vpaddd "#rtype"5, "#rtype"30, "#rtype"5 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ - "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ - "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ - "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), "#rtype"26 \n\t" \ - "vpbroadcastd ("#input", %[stepC16], 2), "#rtype"27 \n\t" \ - "vpaddd "#rtype"6, "#rtype"28, "#rtype"6 \n\t" \ - "vpaddd "#rtype"7, "#rtype"29, "#rtype"7 \n\t" \ - "vpaddd "#rtype"8, "#rtype"30, "#rtype"8 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ - "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ - "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ - "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ - "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ - "vpaddd "#rtype"9, "#rtype"28, "#rtype"9 \n\t" \ - "vpaddd "#rtype"10, "#rtype"29, "#rtype"10 \n\t" \ - "vpaddd "#rtype"11, "#rtype"30, "#rtype"11 \n\t" + "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"26 \n\t" \ + "vpbroadcastd ("#input", %%r10, 2), "#rtype"27 \n\t" \ + 
"vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"26 \n\t" \ + "vpbroadcastd ("#input", %%r10, 2), "#rtype"27 \n\t" \ + "vpaddd "#rtype"0, "#rtype"28, "#rtype"0 \n\t" \ + "vpaddd "#rtype"1, "#rtype"29, "#rtype"1 \n\t" \ + "vpaddd "#rtype"2, "#rtype"30, "#rtype"2 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"26 \n\t" \ + "vpbroadcastd ("#input", %%r10, 2), "#rtype"27 \n\t" \ + "vpaddd "#rtype"3, "#rtype"28, "#rtype"3 \n\t" \ + "vpaddd "#rtype"4, "#rtype"29, "#rtype"4 \n\t" \ + "vpaddd "#rtype"5, "#rtype"30, "#rtype"5 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"26 \n\t" \ + "vpbroadcastd ("#input", %%r10, 2), "#rtype"27 \n\t" \ + "vpaddd "#rtype"6, "#rtype"28, "#rtype"6 \n\t" \ + "vpaddd "#rtype"7, "#rtype"29, "#rtype"7 \n\t" \ + "vpaddd "#rtype"8, "#rtype"30, "#rtype"8 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ + "vpaddd "#rtype"9, "#rtype"28, "#rtype"9 \n\t" \ + "vpaddd "#rtype"10, "#rtype"29, "#rtype"10 \n\t" \ + "vpaddd "#rtype"11, "#rtype"30, "#rtype"11 \n\t" #define convKernel1x16c4_3(input, freg0, off0, preg0, rtype) \ - "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ - "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ - "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ - "vpaddd "#rtype"0, "#rtype"28, "#rtype"0 \n\t" - -#define convKernel24x16c4(input, freg0, off0, preg0, rtype) \ + "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ + "vpaddd "#rtype"0, "#rtype"28, "#rtype"0 \n\t" + +#define convKernel24x16c4_4(input, freg0, off0, preg0, rtype) \ + "movq (%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ + "vpbroadcastd ("#input", 
%%r10), "#rtype"26 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x8(%[stepC16]), "#input" \n\t" \ + "movq 0x10(%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"27 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"25 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x18(%[stepC16]), "#input" \n\t" \ + "movq 0x20(%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"27 \n\t" \ + "vpaddd "#rtype"0, "#rtype"28, "#rtype"0 \n\t" \ + "vpaddd "#rtype"1, "#rtype"29, "#rtype"1 \n\t" \ + "vpaddd "#rtype"2, "#rtype"30, "#rtype"2 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x28(%[stepC16]), "#input" \n\t" \ + "movq 0x30(%[stepC16]), %%r10 \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"26 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x38(%[stepC16]), "#input" \n\t" \ + "movq 0x40(%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"27 \n\t" \ + "vpaddd "#rtype"3, "#rtype"28, "#rtype"3 \n\t" \ + "vpaddd "#rtype"4, "#rtype"29, "#rtype"4 \n\t" \ + "vpaddd "#rtype"5, "#rtype"30, "#rtype"5 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"25 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x38(%[stepC16]), "#input" \n\t" \ + "movq 0x40(%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"27 \n\t" \ + "vpaddd "#rtype"6, "#rtype"28, "#rtype"6 \n\t" \ + "vpaddd "#rtype"7, "#rtype"29, "#rtype"7 \n\t" \ + "vpaddd "#rtype"8, "#rtype"30, "#rtype"8 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x48(%[stepC16]), "#input" \n\t" \ + "movq 0x50(%[stepC16]), %%r10 \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"26 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x58(%[stepC16]), "#input" \n\t" \ + "movq 0x60(%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"27 \n\t" \ + "vpaddd "#rtype"9, "#rtype"28, "#rtype"9 \n\t" \ + "vpaddd "#rtype"10, "#rtype"29, "#rtype"10 \n\t" \ + "vpaddd "#rtype"11, "#rtype"30, "#rtype"11 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", 
"#rtype"27, "#rtype"30 \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"25 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x68(%[stepC16]), "#input" \n\t" \ + "movq 0x70(%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"27 \n\t" \ + "vpaddd "#rtype"12, "#rtype"28, "#rtype"12 \n\t" \ + "vpaddd "#rtype"13, "#rtype"29, "#rtype"13 \n\t" \ + "vpaddd "#rtype"14, "#rtype"30, "#rtype"14 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x78(%[stepC16]), "#input" \n\t" \ + "movq 0x80(%[stepC16]), %%r10 \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"26 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x88(%[stepC16]), "#input" \n\t" \ + "movq 0x90(%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"27 \n\t" \ + "vpaddd "#rtype"15, "#rtype"28, "#rtype"15 \n\t" \ + "vpaddd "#rtype"16, "#rtype"29, "#rtype"16 \n\t" \ + "vpaddd "#rtype"17, "#rtype"30, "#rtype"17 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"25 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x98(%[stepC16]), "#input" \n\t" \ + "movq 0xA0(%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"27 \n\t" \ + "vpaddd "#rtype"18, "#rtype"28, "#rtype"18 \n\t" \ + "vpaddd "#rtype"19, "#rtype"29, "#rtype"19 \n\t" \ + "vpaddd "#rtype"20, "#rtype"30, "#rtype"20 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ + "vpaddd "#rtype"21, "#rtype"28, "#rtype"21 \n\t" \ + "vpaddd "#rtype"22, "#rtype"29, "#rtype"22 \n\t" \ + "vpaddd "#rtype"23, "#rtype"30, "#rtype"23 \n\t" + +#define convKernel12x16c4_4(input, freg0, off0, preg0, rtype) \ + "movq (%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"26 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x8(%[stepC16]), "#input" \n\t" \ + "movq 0x10(%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"27 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"25 \n\t" \ + "addq %%r10, 
"#input" \n\t" \ + "addq 0x18(%[stepC16]), "#input" \n\t" \ + "movq 0x20(%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"27 \n\t" \ + "vpaddd "#rtype"0, "#rtype"28, "#rtype"0 \n\t" \ + "vpaddd "#rtype"1, "#rtype"29, "#rtype"1 \n\t" \ + "vpaddd "#rtype"2, "#rtype"30, "#rtype"2 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x28(%[stepC16]), "#input" \n\t" \ + "movq 0x30(%[stepC16]), %%r10 \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"26 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x38(%[stepC16]), "#input" \n\t" \ + "movq 0x40(%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"27 \n\t" \ + "vpaddd "#rtype"3, "#rtype"28, "#rtype"3 \n\t" \ + "vpaddd "#rtype"4, "#rtype"29, "#rtype"4 \n\t" \ + "vpaddd "#rtype"5, "#rtype"30, "#rtype"5 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"25 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x48(%[stepC16]), "#input" \n\t" \ + "movq 0x50(%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"27 \n\t" \ + "vpaddd "#rtype"6, "#rtype"28, "#rtype"6 \n\t" \ + "vpaddd "#rtype"7, "#rtype"29, "#rtype"7 \n\t" \ + "vpaddd "#rtype"8, "#rtype"30, "#rtype"8 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ + "vpaddd "#rtype"9, "#rtype"28, "#rtype"9 \n\t" \ + "vpaddd "#rtype"10, "#rtype"29, "#rtype"10 \n\t" \ + "vpaddd "#rtype"11, "#rtype"30, "#rtype"11 \n\t" + +#define convKernel1x16c4_4(input, freg0, off0, preg0, rtype) \ + "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ + "vpaddd "#rtype"0, "#rtype"28, "#rtype"0 \n\t" + +#define convKernel24x16c4_0(input, freg0, off0, preg0, rtype) \ convKernel24x16c4_3(input, rtype##24, off0, rtype##25, rtype) -#define convKernel12x16c4(input, freg0, off0, preg0, rtype) \ +#define convKernel12x16c4_0(input, freg0, off0, preg0, rtype) \ convKernel12x16c4_3(input, rtype##24, off0, rtype##25, rtype) -#define convKernel1x16c4(input, freg0, off0, preg0, rtype) \ +#define convKernel1x16c4_0(input, freg0, off0, preg0, rtype) \ convKernel1x16c4_3(input, rtype##24, off0, rtype##25, rtype) + +#define convKernel24x16c4_1(input, freg0, off0, preg0, rtype) \ + convKernel24x16c4_4(input, rtype##24, off0, rtype##25, rtype) + +#define convKernel12x16c4_1(input, freg0, off0, preg0, rtype) \ + convKernel12x16c4_4(input, 
rtype##24, off0, rtype##25, rtype) + +#define convKernel1x16c4_1(input, freg0, off0, preg0, rtype) \ + convKernel1x16c4_4(input, rtype##24, off0, rtype##25, rtype) + #endif -#define convKernelForLoopXx16(rnum, wsize, rtype, off0, off1, off2, off3, off4) \ - __asm__ __volatile__("vmovups (%[filter]), "#rtype"24 \n\t" \ - "addq $"#off1", %[filter] \n\t" \ - "mov $1, %%eax \n\t" \ - "vmovd %%eax, %%xmm0 \n\t" \ - "vpbroadcastw %%xmm0, "#rtype"31 \n\t" \ - "movq %[flags], %%rax \n\t" \ - "andq $0x1, %%rax \n\t" \ - "jne 0f \n\t" \ - load16BiasTo##rnum##Regs(%[bias], rtype) \ - "cmpq $0x10, %%rcx \n\t" \ - "jl 4f \n\t" \ - "jmp 1f \n\t" \ - ".align 16 \n\t" \ - "0: \n\t" \ - clear##rnum##Regs(rtype) \ - "cmpq $0x10, %%rcx \n\t" \ - "jl 4f \n\t" \ - ".align 16 \n\t" \ - "1: \n\t" \ - "mov %[kh], %%rbx \n\t" \ - ".align 16 \n\t" \ - "2: \n\t" \ - "mov %[kw], %%r9 \n\t" \ - ".align 16 \n\t" \ - "3: \n\t" \ - "movq %[input], %%rax \n\t" \ - convKernel##wsize##x16c4(%%rax, rtype##24, off0, rtype##25, rtype) \ - "movq %[input], %%rax \n\t" \ - "addq $0x4, %%rax \n\t" \ - convKernel##wsize##x16c4(%%rax, rtype##25, off1, rtype##24, rtype) \ - "movq %[input], %%rax \n\t" \ - "addq $0x8, %%rax \n\t" \ - convKernel##wsize##x16c4(%%rax, rtype##24, off2, rtype##25, rtype) \ - "movq %[input], %%rax \n\t" \ - "addq $0xC, %%rax \n\t" \ - convKernel##wsize##x16c4(%%rax, rtype##25, off3, rtype##24, rtype) \ - "addq $"#off4", %[filter] \n\t" \ - "addq %[dilateW], %[input] \n\t" \ - "dec %%r9 \n\t" \ - "jg 3b \n\t" \ - "addq %[dilateH], %[input] \n\t" \ - "dec %%rbx \n\t" \ - "jg 2b \n\t" \ - "addq %[fStep], %[input] \n\t" \ - "subq $0x10, %%rcx \n\t" \ - "cmpq $0x10, %%rcx \n\t" \ - "jge 1b \n\t" \ - "subq %[fStep], %[input] \n\t" \ - "addq %[f8Step], %[input] \n\t" \ - ".align 16 \n\t" \ - "4: \n\t" \ - : "+c" (c.ic), [input] "+r" (c.input), [filter] "+r" (c.filter) \ - : [bias] "r" (c.bias), [kh] "r" (c.kh), [kw] "r" (c.kw), \ - [stepC16] "r" (c.stepC16), [dilateW] "r" (c.dilateW), \ - [dilateH] "r" (c.dilateH), [fStep] "r" (c.fStep), [flags] "r" (c.flags), \ - [f8Step] "r" (c.f8Step) \ - : "%rax", "%rbx", "%r9", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", \ - "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", \ - "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", \ - "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", \ - "%zmm31", "memory", "cc"); \ - if (c.ic > 0) { \ - __asm__ __volatile__("cmpq $0x8, %%rcx \n\t" \ - "jl 2f \n\t" \ - "subq $0x8, %%rcx \n\t" \ - "shr $1, %[dilateW] \n\t" \ - "shr $1, %[dilateH] \n\t" \ - "shr $1, %[fStep] \n\t" \ - "shr $1, %[stepC16] \n\t" \ - "mov %[kh], %%rbx \n\t" \ - ".align 16 \n\t" \ - "0: \n\t" \ - "mov %[kw], %%r9 \n\t" \ - ".align 16 \n\t" \ - "1: \n\t" \ - "movq %[input], %%rax \n\t" \ - convKernel##wsize##x16c4(%%rax, rtype##24, off0, rtype##25, rtype) \ - "movq %[input], %%rax \n\t" \ - "addq $0x4, %%rax \n\t" \ - convKernel##wsize##x16c4(%%rax, rtype##25, off1, rtype##24, rtype) \ - "addq $"#off2", %[filter] \n\t" \ - "addq %[dilateW], %[input] \n\t" \ - "dec %%r9 \n\t" \ - "jg 1b \n\t" \ - "addq %[dilateH], %[input] \n\t" \ - "dec %%rbx \n\t" \ - "jg 0b \n\t" \ - "addq %[f4Step], %[input] \n\t" \ - ".align 16 \n\t" \ - "2: \n\t" \ - "cmpq $0x4, %%rcx \n\t" \ - "jl 5f \n\t" \ - "shr $1, %[dilateW] \n\t" \ - "shr $1, %[dilateH] \n\t" \ - "shr $1, %[stepC16] \n\t" \ - "mov %[kh], %%rbx \n\t" \ - ".align 16 \n\t" \ - "3: \n\t" \ - "mov %[kw], %%r9 \n\t" \ - 
".align 16 \n\t" \ - "4: \n\t" \ - "movq %[input], %%rax \n\t" \ - convKernel##wsize##x16c4(%%rax, rtype##24, off0, rtype##25, rtype) \ - "addq $"#off1", %[filter] \n\t" \ - "addq %[dilateW], %[input] \n\t" \ - "dec %%r9 \n\t" \ - "jg 4b \n\t" \ - "addq %[dilateH], %[input] \n\t" \ - "dec %%rbx \n\t" \ - "jg 3b \n\t" \ - ".align 16 \n\t" \ - "5: \n\t" \ - : "+c" (c.ic) \ - : [input] "r" (c.input), [filter] "r" (c.filter), [bias] "r" (c.bias), [kh] "r" (c.kh), [kw] "r" (c.kw), \ - [stepC16] "r" (c.stepC16), [dilateW] "r" (c.dilateW), \ - [dilateH] "r" (c.dilateH), [fStep] "r" (c.fStep), \ - [f4Step] "r" (c.f4Step) \ - : "%rax", "%rbx", "%r9", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", \ - "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", \ - "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", \ - "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", \ - "%zmm31", "memory", "cc"); \ +#define convKernelForLoopXx16(rnum, wsize, rtype, off0, off1, off2, off3, off4, cross) \ + __asm__ __volatile__("vmovups (%[filter]), "#rtype"24 \n\t" \ + "addq $"#off1", %[filter] \n\t" \ + "mov $1, %%eax \n\t" \ + "vmovd %%eax, %%xmm0 \n\t" \ + "vpbroadcastw %%xmm0, "#rtype"31 \n\t" \ + "movq %[flags], %%rax \n\t" \ + "andq $0x1, %%rax \n\t" \ + "jne 0f \n\t" \ + load16BiasTo##rnum##Regs(%[bias], rtype) \ + "jmp 1f \n\t" \ + ".align 16 \n\t" \ + "0: \n\t" \ + clear##rnum##Regs(rtype) \ + ".align 16 \n\t" \ + "1: \n\t" \ + : [filter] "+r" (c.filter) \ + : [bias] "r" (c.bias), \ + [flags] "r" (c.flags) \ + : "%rax", \ + "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", \ + "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", \ + "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", \ + "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", \ + "memory", "cc"); \ + if (c.ic >= 16) { \ + __asm__ __volatile__("movq (%[stepC16]), %%r10 \n\t" \ + ".align 16 \n\t" \ + "1: \n\t" \ + "mov %[kh], %%rbx \n\t" \ + ".align 16 \n\t" \ + "2: \n\t" \ + "mov %[kw], %%r9 \n\t" \ + ".align 16 \n\t" \ + "3: \n\t" \ + "movq %[input], %%rax \n\t" \ + convKernel##wsize##x16c4_##cross( \ + %%rax, rtype##24, off0, rtype##25, rtype) \ + "movq %[input], %%rax \n\t" \ + "addq $0x4, %%rax \n\t" \ + convKernel##wsize##x16c4_##cross( \ + %%rax, rtype##25, off1, rtype##24, rtype) \ + "movq %[input], %%rax \n\t" \ + "addq $0x8, %%rax \n\t" \ + convKernel##wsize##x16c4_##cross( \ + %%rax, rtype##24, off2, rtype##25, rtype) \ + "movq %[input], %%rax \n\t" \ + "addq $0xC, %%rax \n\t" \ + convKernel##wsize##x16c4_##cross( \ + %%rax, rtype##25, off3, rtype##24, rtype) \ + "addq $"#off4", %[filter] \n\t" \ + "addq %[dilateW], %[input] \n\t" \ + "dec %%r9 \n\t" \ + "jg 3b \n\t" \ + "addq %[dilateH], %[input] \n\t" \ + "dec %%rbx \n\t" \ + "jg 2b \n\t" \ + "addq %[fStep], %[input] \n\t" \ + "subq $0x10, %%rcx \n\t" \ + "cmpq $0x10, %%rcx \n\t" \ + "jge 1b \n\t" \ + "subq %[fStep], %[input] \n\t" \ + "addq %[f8Step], %[input] \n\t" \ + ".align 16 \n\t" \ + "4: \n\t" \ + : "+c" (c.ic), \ + [input] "+r" (c.input), \ + [filter] "+r" (c.filter) \ + : [bias] "r" (c.bias), \ + [kh] "r" (c.kh), \ + [kw] "r" (c.kw), \ + [stepC16] "r" (c.stepC16), \ + [dilateW] "r" (c.dilateW), \ + [dilateH] "r" (c.dilateH), \ + [fStep] "r" (c.fStep), \ + [f8Step] "r" (c.f8Step) \ + : "%rax", "%rbx", "%r9", "%r10", \ + "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", \ + "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", \ + "%zmm12", "%zmm13", 
"%zmm14", "%zmm15", "%zmm16", "%zmm17", \ + "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", \ + "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", \ + "%zmm30", "%zmm31", "memory", "cc"); \ + } \ + if (c.ic > 0) { \ + __asm__ __volatile__("cmpq $0x8, %%rcx \n\t" \ + "jl 2f \n\t" \ + "subq $0x8, %%rcx \n\t" \ + "shr $1, %[dilateW] \n\t" \ + "shr $1, %[dilateH] \n\t" \ + "shr $1, %[fStep] \n\t" \ + "addq $192, %[stepC16] \n\t" \ + "mov %[kh], %%rbx \n\t" \ + ".align 16 \n\t" \ + "0: \n\t" \ + "mov %[kw], %%r9 \n\t" \ + ".align 16 \n\t" \ + "1: \n\t" \ + "movq %[input], %%rax \n\t" \ + convKernel##wsize##x16c4_##cross( \ + %%rax, rtype##24, off0, rtype##25, rtype) \ + "movq %[input], %%rax \n\t" \ + "addq $0x4, %%rax \n\t" \ + convKernel##wsize##x16c4_##cross( \ + %%rax, rtype##25, off1, rtype##24, rtype) \ + "addq $"#off2", %[filter] \n\t" \ + "addq %[dilateW], %[input] \n\t" \ + "dec %%r9 \n\t" \ + "jg 1b \n\t" \ + "addq %[dilateH], %[input] \n\t" \ + "dec %%rbx \n\t" \ + "jg 0b \n\t" \ + "addq %[f4Step], %[input] \n\t" \ + ".align 16 \n\t" \ + "2: \n\t" \ + "cmpq $0x4, %%rcx \n\t" \ + "jl 5f \n\t" \ + "shr $1, %[dilateW] \n\t" \ + "shr $1, %[dilateH] \n\t" \ + "addq $192, %[stepC16] \n\t" \ + "mov %[kh], %%rbx \n\t" \ + ".align 16 \n\t" \ + "3: \n\t" \ + "mov %[kw], %%r9 \n\t" \ + ".align 16 \n\t" \ + "4: \n\t" \ + "movq %[input], %%rax \n\t" \ + convKernel##wsize##x16c4_##cross( \ + %%rax, rtype##24, off0, rtype##25, rtype) \ + "addq $"#off1", %[filter] \n\t" \ + "addq %[dilateW], %[input] \n\t" \ + "dec %%r9 \n\t" \ + "jg 4b \n\t" \ + "addq %[dilateH], %[input] \n\t" \ + "dec %%rbx \n\t" \ + "jg 3b \n\t" \ + ".align 16 \n\t" \ + "5: \n\t" \ + : "+c" (c.ic) \ + : [input] "r" (c.input), \ + [filter] "r" (c.filter), \ + [bias] "r" (c.bias), \ + [kh] "r" (c.kh), \ + [kw] "r" (c.kw), \ + [stepC16] "r" (c.stepC16), \ + [dilateW] "r" (c.dilateW), \ + [dilateH] "r" (c.dilateH), \ + [fStep] "r" (c.fStep), \ + [f4Step] "r" (c.f4Step) \ + : "%rax", "%rbx", "%r9", "%r10", \ + "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", \ + "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", \ + "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", \ + "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", \ + "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", \ + "%zmm30", "%zmm31", "memory", "cc"); \ } void Avx512ConvKernel24x16(ConvController &c) { - convKernelForLoopXx16(24, 24, %%zmm, 0x0, 0x40, 0x80, 0xC0, 0x100) - - __asm__ __volatile__("movq %[output], %%rax \n\t" - "movq %[ostepC16], %%rbx \n\t" - "movq %[flags], %%rcx \n\t" - "and $0x1, %%rcx \n\t" - "je 0f \n\t" - "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" - "vpaddd 0x40(%%rax), %%zmm1, %%zmm1 \n\t" - "vpaddd 0x80(%%rax), %%zmm2, %%zmm2 \n\t" - "vpaddd 0xC0(%%rax), %%zmm3, %%zmm3 \n\t" - "vpaddd 0x100(%%rax), %%zmm4, %%zmm4 \n\t" - "vpaddd 0x140(%%rax), %%zmm5, %%zmm5 \n\t" - "vpaddd 0x180(%%rax), %%zmm6, %%zmm6 \n\t" - "vpaddd 0x1C0(%%rax), %%zmm7, %%zmm7 \n\t" - "vpaddd 0x200(%%rax), %%zmm8, %%zmm8 \n\t" - "vpaddd 0x240(%%rax), %%zmm9, %%zmm9 \n\t" - "vpaddd 0x280(%%rax), %%zmm10, %%zmm10 \n\t" - "vpaddd 0x2C0(%%rax), %%zmm11, %%zmm11 \n\t" - "vpaddd 0x300(%%rax), %%zmm12, %%zmm12 \n\t" - "vpaddd 0x340(%%rax), %%zmm13, %%zmm13 \n\t" - "vpaddd 0x380(%%rax), %%zmm14, %%zmm14 \n\t" - "vpaddd 0x3C0(%%rax), %%zmm15, %%zmm15 \n\t" - "vpaddd 0x400(%%rax), %%zmm16, %%zmm16 \n\t" - "vpaddd 0x440(%%rax), %%zmm17, %%zmm17 \n\t" - "vpaddd 0x480(%%rax), %%zmm18, %%zmm18 \n\t" - "vpaddd 0x4C0(%%rax), %%zmm19, %%zmm19 
\n\t" - "vpaddd 0x500(%%rax), %%zmm20, %%zmm20 \n\t" - "vpaddd 0x540(%%rax), %%zmm21, %%zmm21 \n\t" - "vpaddd 0x580(%%rax), %%zmm22, %%zmm22 \n\t" - "vpaddd 0x5C0(%%rax), %%zmm23, %%zmm23 \n\t" + if (c.cross) { + convKernelForLoopXx16(24, 24, %%zmm, 0x0, 0x40, 0x80, 0xC0, 0x100, 1) + } else { + convKernelForLoopXx16(24, 24, %%zmm, 0x0, 0x40, 0x80, 0xC0, 0x100, 0) + } - ".align 16 \n\t" - "0: \n\t" - "movq %[flags], %%rcx \n\t" - "and $0xC, %%rcx \n\t" - "je 1f \n\t" + __asm__ __volatile__("movq %[output], %%rax \n\t" + "movq %[ostepC16], %%rbx \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x1, %%rcx \n\t" + "je 0f \n\t" + "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" + "vpaddd 0x40(%%rax), %%zmm1, %%zmm1 \n\t" + "vpaddd 0x80(%%rax), %%zmm2, %%zmm2 \n\t" + "vpaddd 0xC0(%%rax), %%zmm3, %%zmm3 \n\t" + "vpaddd 0x100(%%rax), %%zmm4, %%zmm4 \n\t" + "vpaddd 0x140(%%rax), %%zmm5, %%zmm5 \n\t" + "vpaddd 0x180(%%rax), %%zmm6, %%zmm6 \n\t" + "vpaddd 0x1C0(%%rax), %%zmm7, %%zmm7 \n\t" + "vpaddd 0x200(%%rax), %%zmm8, %%zmm8 \n\t" + "vpaddd 0x240(%%rax), %%zmm9, %%zmm9 \n\t" + "vpaddd 0x280(%%rax), %%zmm10, %%zmm10 \n\t" + "vpaddd 0x2C0(%%rax), %%zmm11, %%zmm11 \n\t" + "vpaddd 0x300(%%rax), %%zmm12, %%zmm12 \n\t" + "vpaddd 0x340(%%rax), %%zmm13, %%zmm13 \n\t" + "vpaddd 0x380(%%rax), %%zmm14, %%zmm14 \n\t" + "vpaddd 0x3C0(%%rax), %%zmm15, %%zmm15 \n\t" + "vpaddd 0x400(%%rax), %%zmm16, %%zmm16 \n\t" + "vpaddd 0x440(%%rax), %%zmm17, %%zmm17 \n\t" + "vpaddd 0x480(%%rax), %%zmm18, %%zmm18 \n\t" + "vpaddd 0x4C0(%%rax), %%zmm19, %%zmm19 \n\t" + "vpaddd 0x500(%%rax), %%zmm20, %%zmm20 \n\t" + "vpaddd 0x540(%%rax), %%zmm21, %%zmm21 \n\t" + "vpaddd 0x580(%%rax), %%zmm22, %%zmm22 \n\t" + "vpaddd 0x5C0(%%rax), %%zmm23, %%zmm23 \n\t" + + ".align 16 \n\t" + "0: \n\t" + "cmpq $0x0, %[scale] \n\t" + "jne 1f \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" relu24Regs(%%zmm) + "jmp 4f \n\t" - ".align 16 \n\t" - "1: \n\t" - "cmpq $0x0, %[scale] \n\t" - "je 2f \n\t" + ".align 16 \n\t" + "1: \n\t" + "cmpq $0x0, %[scale] \n\t" + "je 3f \n\t" convert24RegsI32ToF32(%[scale], %%zmm) - ".align 16 \n\t" - "2: \n\t" - "vmovups %%zmm0, (%%rax) \n\t" - "vmovups %%zmm1, 0x40(%%rax) \n\t" - "vmovups %%zmm2, 0x80(%%rax) \n\t" - "vmovups %%zmm3, 0xC0(%%rax) \n\t" - "vmovups %%zmm4, 0x100(%%rax) \n\t" - "vmovups %%zmm5, 0x140(%%rax) \n\t" - "vmovups %%zmm6, 0x180(%%rax) \n\t" - "vmovups %%zmm7, 0x1C0(%%rax) \n\t" - "vmovups %%zmm8, 0x200(%%rax) \n\t" - "vmovups %%zmm9, 0x240(%%rax) \n\t" - "vmovups %%zmm10, 0x280(%%rax) \n\t" - "vmovups %%zmm11, 0x2C0(%%rax) \n\t" - "vmovups %%zmm12, 0x300(%%rax) \n\t" - "vmovups %%zmm13, 0x340(%%rax) \n\t" - "vmovups %%zmm14, 0x380(%%rax) \n\t" - "vmovups %%zmm15, 0x3C0(%%rax) \n\t" - "vmovups %%zmm16, 0x400(%%rax) \n\t" - "vmovups %%zmm17, 0x440(%%rax) \n\t" - "vmovups %%zmm18, 0x480(%%rax) \n\t" - "vmovups %%zmm19, 0x4C0(%%rax) \n\t" - "vmovups %%zmm20, 0x500(%%rax) \n\t" - "vmovups %%zmm21, 0x540(%%rax) \n\t" - "vmovups %%zmm22, 0x580(%%rax) \n\t" - "vmovups %%zmm23, 0x5C0(%%rax) \n\t" + ".align 16 \n\t" + "2: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x2, %%rcx \n\t" + "je 3f \n\t" + "vaddps (%[eltwise]), %%zmm0, %%zmm0 \n\t" + "vaddps 0x40(%[eltwise]), %%zmm1, %%zmm1 \n\t" + "vaddps 0x80(%[eltwise]), %%zmm2, %%zmm2 \n\t" + "vaddps 0xC0(%[eltwise]), %%zmm3, %%zmm3 \n\t" + "vaddps 0x100(%[eltwise]), %%zmm4, %%zmm4 \n\t" + "vaddps 0x140(%[eltwise]), %%zmm5, %%zmm5 \n\t" + "vaddps 0x180(%[eltwise]), %%zmm6, %%zmm6 \n\t" + "vaddps 0x1C0(%[eltwise]), %%zmm7, %%zmm7 \n\t" + "vaddps 
0x200(%[eltwise]), %%zmm8, %%zmm8 \n\t" + "vaddps 0x240(%[eltwise]), %%zmm9, %%zmm9 \n\t" + "vaddps 0x280(%[eltwise]), %%zmm10, %%zmm10 \n\t" + "vaddps 0x2C0(%[eltwise]), %%zmm11, %%zmm11 \n\t" + "vaddps 0x300(%[eltwise]), %%zmm12, %%zmm12 \n\t" + "vaddps 0x340(%[eltwise]), %%zmm13, %%zmm13 \n\t" + "vaddps 0x380(%[eltwise]), %%zmm14, %%zmm14 \n\t" + "vaddps 0x3C0(%[eltwise]), %%zmm15, %%zmm15 \n\t" + "vaddps 0x400(%[eltwise]), %%zmm16, %%zmm16 \n\t" + "vaddps 0x440(%[eltwise]), %%zmm17, %%zmm17 \n\t" + "vaddps 0x480(%[eltwise]), %%zmm18, %%zmm18 \n\t" + "vaddps 0x4C0(%[eltwise]), %%zmm19, %%zmm19 \n\t" + "vaddps 0x500(%[eltwise]), %%zmm20, %%zmm20 \n\t" + "vaddps 0x540(%[eltwise]), %%zmm21, %%zmm21 \n\t" + "vaddps 0x580(%[eltwise]), %%zmm22, %%zmm22 \n\t" + "vaddps 0x5C0(%[eltwise]), %%zmm23, %%zmm23 \n\t" + + ".align 16 \n\t" + "3: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" + relu24RegsPs(%%zmm) + + ".align 16 \n\t" + "4: \n\t" + "vmovups %%zmm0, (%%rax) \n\t" + "vmovups %%zmm1, 0x40(%%rax) \n\t" + "vmovups %%zmm2, 0x80(%%rax) \n\t" + "vmovups %%zmm3, 0xC0(%%rax) \n\t" + "vmovups %%zmm4, 0x100(%%rax) \n\t" + "vmovups %%zmm5, 0x140(%%rax) \n\t" + "vmovups %%zmm6, 0x180(%%rax) \n\t" + "vmovups %%zmm7, 0x1C0(%%rax) \n\t" + "vmovups %%zmm8, 0x200(%%rax) \n\t" + "vmovups %%zmm9, 0x240(%%rax) \n\t" + "vmovups %%zmm10, 0x280(%%rax) \n\t" + "vmovups %%zmm11, 0x2C0(%%rax) \n\t" + "vmovups %%zmm12, 0x300(%%rax) \n\t" + "vmovups %%zmm13, 0x340(%%rax) \n\t" + "vmovups %%zmm14, 0x380(%%rax) \n\t" + "vmovups %%zmm15, 0x3C0(%%rax) \n\t" + "vmovups %%zmm16, 0x400(%%rax) \n\t" + "vmovups %%zmm17, 0x440(%%rax) \n\t" + "vmovups %%zmm18, 0x480(%%rax) \n\t" + "vmovups %%zmm19, 0x4C0(%%rax) \n\t" + "vmovups %%zmm20, 0x500(%%rax) \n\t" + "vmovups %%zmm21, 0x540(%%rax) \n\t" + "vmovups %%zmm22, 0x580(%%rax) \n\t" + "vmovups %%zmm23, 0x5C0(%%rax) \n\t" : - : [output] "r" (c.output), [ostepC16] "r" (c.ostepC16), [flags] "r" (c.flags), [scale] "r" (c.scale) - : "%rax", "%rbx", "%rcx", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", - "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", - "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", - "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", - "%zmm31", "memory", "cc"); + : [output] "r" (c.output), + [ostepC16] "r" (c.ostepC16), + [eltwise] "r" (c.eltwise), + [flags] "r" (c.flags), + [scale] "r" (c.scale) + : "%rax", "%rbx", "%rcx", + "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", + "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", + "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", + "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", + "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", + "%zmm30", "%zmm31", "memory", "cc"); } void Avx512ConvKernel12x16(ConvController &c) { - convKernelForLoopXx16(12, 12, %%zmm, 0x0, 0x40, 0x80, 0xC0, 0x100) - - __asm__ __volatile__("movq %[output], %%rax \n\t" - "movq %[ostepC16], %%rbx \n\t" - "movq %[flags], %%rcx \n\t" - "and $0x1, %%rcx \n\t" - "je 0f \n\t" - "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" - "vpaddd 0x40(%%rax), %%zmm1, %%zmm1 \n\t" - "vpaddd 0x80(%%rax), %%zmm2, %%zmm2 \n\t" - "vpaddd 0xC0(%%rax), %%zmm3, %%zmm3 \n\t" - "vpaddd 0x100(%%rax), %%zmm4, %%zmm4 \n\t" - "vpaddd 0x140(%%rax), %%zmm5, %%zmm5 \n\t" - "vpaddd 0x180(%%rax), %%zmm6, %%zmm6 \n\t" - "vpaddd 0x1C0(%%rax), %%zmm7, %%zmm7 \n\t" - "vpaddd 0x200(%%rax), %%zmm8, %%zmm8 \n\t" - "vpaddd 
0x240(%%rax), %%zmm9, %%zmm9 \n\t" - "vpaddd 0x280(%%rax), %%zmm10, %%zmm10 \n\t" - "vpaddd 0x2C0(%%rax), %%zmm11, %%zmm11 \n\t" + if (c.cross) { + convKernelForLoopXx16(12, 12, %%zmm, 0x0, 0x40, 0x80, 0xC0, 0x100, 1) + } else { + convKernelForLoopXx16(12, 12, %%zmm, 0x0, 0x40, 0x80, 0xC0, 0x100, 0) + } - ".align 16 \n\t" - "0: \n\t" - "movq %[flags], %%rcx \n\t" - "and $0xC, %%rcx \n\t" - "je 1f \n\t" + __asm__ __volatile__("movq %[output], %%rax \n\t" + "movq %[ostepC16], %%rbx \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x1, %%rcx \n\t" + "je 0f \n\t" + "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" + "vpaddd 0x40(%%rax), %%zmm1, %%zmm1 \n\t" + "vpaddd 0x80(%%rax), %%zmm2, %%zmm2 \n\t" + "vpaddd 0xC0(%%rax), %%zmm3, %%zmm3 \n\t" + "vpaddd 0x100(%%rax), %%zmm4, %%zmm4 \n\t" + "vpaddd 0x140(%%rax), %%zmm5, %%zmm5 \n\t" + "vpaddd 0x180(%%rax), %%zmm6, %%zmm6 \n\t" + "vpaddd 0x1C0(%%rax), %%zmm7, %%zmm7 \n\t" + "vpaddd 0x200(%%rax), %%zmm8, %%zmm8 \n\t" + "vpaddd 0x240(%%rax), %%zmm9, %%zmm9 \n\t" + "vpaddd 0x280(%%rax), %%zmm10, %%zmm10 \n\t" + "vpaddd 0x2C0(%%rax), %%zmm11, %%zmm11 \n\t" + + ".align 16 \n\t" + "0: \n\t" + "cmpq $0x0, %[scale] \n\t" + "jne 1f \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" relu12Regs(%%zmm) + "jmp 4f \n\t" - ".align 16 \n\t" - "1: \n\t" - "cmpq $0x0, %[scale] \n\t" - "je 2f \n\t" + ".align 16 \n\t" + "1: \n\t" convert12RegsI32ToF32(%[scale], %%zmm) - ".align 16 \n\t" - "2: \n\t" - "vmovups %%zmm0, (%%rax) \n\t" - "vmovups %%zmm1, 0x40(%%rax) \n\t" - "vmovups %%zmm2, 0x80(%%rax) \n\t" - "vmovups %%zmm3, 0xC0(%%rax) \n\t" - "vmovups %%zmm4, 0x100(%%rax) \n\t" - "vmovups %%zmm5, 0x140(%%rax) \n\t" - "vmovups %%zmm6, 0x180(%%rax) \n\t" - "vmovups %%zmm7, 0x1C0(%%rax) \n\t" - "vmovups %%zmm8, 0x200(%%rax) \n\t" - "vmovups %%zmm9, 0x240(%%rax) \n\t" - "vmovups %%zmm10, 0x280(%%rax) \n\t" - "vmovups %%zmm11, 0x2C0(%%rax) \n\t" + ".align 16 \n\t" + "2: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x2, %%rcx \n\t" + "je 3f \n\t" + "vaddps (%[eltwise]), %%zmm0, %%zmm0 \n\t" + "vaddps 0x40(%[eltwise]), %%zmm1, %%zmm1 \n\t" + "vaddps 0x80(%[eltwise]), %%zmm2, %%zmm2 \n\t" + "vaddps 0xC0(%[eltwise]), %%zmm3, %%zmm3 \n\t" + "vaddps 0x100(%[eltwise]), %%zmm4, %%zmm4 \n\t" + "vaddps 0x140(%[eltwise]), %%zmm5, %%zmm5 \n\t" + "vaddps 0x180(%[eltwise]), %%zmm6, %%zmm6 \n\t" + "vaddps 0x1C0(%[eltwise]), %%zmm7, %%zmm7 \n\t" + "vaddps 0x200(%[eltwise]), %%zmm8, %%zmm8 \n\t" + "vaddps 0x240(%[eltwise]), %%zmm9, %%zmm9 \n\t" + "vaddps 0x280(%[eltwise]), %%zmm10, %%zmm10 \n\t" + "vaddps 0x2C0(%[eltwise]), %%zmm11, %%zmm11 \n\t" + + ".align 16 \n\t" + "3: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" + relu12RegsPs(%%zmm) + + ".align 16 \n\t" + "4: \n\t" + "vmovups %%zmm0, (%%rax) \n\t" + "vmovups %%zmm1, 0x40(%%rax) \n\t" + "vmovups %%zmm2, 0x80(%%rax) \n\t" + "vmovups %%zmm3, 0xC0(%%rax) \n\t" + "vmovups %%zmm4, 0x100(%%rax) \n\t" + "vmovups %%zmm5, 0x140(%%rax) \n\t" + "vmovups %%zmm6, 0x180(%%rax) \n\t" + "vmovups %%zmm7, 0x1C0(%%rax) \n\t" + "vmovups %%zmm8, 0x200(%%rax) \n\t" + "vmovups %%zmm9, 0x240(%%rax) \n\t" + "vmovups %%zmm10, 0x280(%%rax) \n\t" + "vmovups %%zmm11, 0x2C0(%%rax) \n\t" : - : [output] "r" (c.output), [ostepC16] "r" (c.ostepC16), [flags] "r" (c.flags), [scale] "r" (c.scale) - : "%rax", "%rbx", "%rcx", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", - "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", - "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", 
"%zmm21", "%zmm22", - "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", - "%zmm31", "memory", "cc"); + : [output] "r" (c.output), + [ostepC16] "r" (c.ostepC16), + [eltwise] "r" (c.eltwise), + [flags] "r" (c.flags), + [scale] "r" (c.scale) + : "%rax", "%rbx", "%rcx", + "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", + "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", + "%zmm24", "%zmm31", "memory", "cc"); } void Avx512ConvKernel1x16(ConvController &c) { - convKernelForLoopXx16(1, 1, %%zmm, 0x0, 0x40, 0x80, 0xC0, 0x100) + if (c.cross) { + convKernelForLoopXx16(1, 1, %%zmm, 0x0, 0x40, 0x80, 0xC0, 0x100, 1) + } else { + convKernelForLoopXx16(1, 1, %%zmm, 0x0, 0x40, 0x80, 0xC0, 0x100, 0) + } - __asm__ __volatile__("movq %[output], %%rax \n\t" - "movq %[ostepC16], %%rbx \n\t" - "movq %[flags], %%rcx \n\t" - "and $0x1, %%rcx \n\t" - "je 0f \n\t" - "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" - "vpaddd 0x40(%%rax), %%zmm1, %%zmm1 \n\t" + __asm__ __volatile__("movq %[output], %%rax \n\t" + "movq %[ostepC16], %%rbx \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x1, %%rcx \n\t" + "je 0f \n\t" + "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" + "vpaddd 0x40(%%rax), %%zmm1, %%zmm1 \n\t" - ".align 16 \n\t" - "0: \n\t" - "movq %[flags], %%rcx \n\t" - "and $0xC, %%rcx \n\t" - "je 1f \n\t" + ".align 16 \n\t" + "0: \n\t" + "cmpq $0x0, %[scale] \n\t" + "jne 1f \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" reluReg(%%zmm) + "jmp 4f \n\t" - ".align 16 \n\t" - "1: \n\t" - "cmpq $0x0, %[scale] \n\t" - "je 2f \n\t" + ".align 16 \n\t" + "1: \n\t" convertRegI32ToF32(%[scale], %%zmm) - ".align 16 \n\t" - "2: \n\t" - "vmovups %%zmm0, (%%rax) \n\t" + ".align 16 \n\t" + "2: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x2, %%rcx \n\t" + "je 3f \n\t" + "vaddps (%[eltwise]), %%zmm0, %%zmm0 \n\t" + + ".align 16 \n\t" + "3: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" + reluRegPs(%%zmm) + + ".align 16 \n\t" + "4: \n\t" + "vmovups %%zmm0, (%%rax) \n\t" : - : [output] "r" (c.output), [ostepC16] "r" (c.ostepC16), [flags] "r" (c.flags), [scale] "r" (c.scale) - : "%rax", "%rbx", "%rcx", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", - "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", - "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", - "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", - "%zmm31", "memory", "cc"); + : [output] "r" (c.output), + [ostepC16] "r" (c.ostepC16), + [eltwise] "r" (c.eltwise), + [flags] "r" (c.flags), + [scale] "r" (c.scale) + : "%rax", "%rbx", "%rcx", + "%zmm0", "%zmm24", "%zmm31", + "memory", "cc"); } void Avx512ConvKernel24x8(ConvController &c) { - convKernelForLoopXx16(24, 24, %%ymm, 0x0, 0x20, 0x40, 0x60, 0x80) - - __asm__ __volatile__("movq %[output], %%rax \n\t" - "movq %[ostepC16], %%rbx \n\t" - "movq %[flags], %%rcx \n\t" - "and $0x1, %%rcx \n\t" - "je 0f \n\t" - "vpaddd (%%rax), %%ymm0, %%ymm0 \n\t" - "vpaddd 0x20(%%rax), %%ymm1, %%ymm1 \n\t" - "vpaddd 0x40(%%rax), %%ymm2, %%ymm2 \n\t" - "vpaddd 0x60(%%rax), %%ymm3, %%ymm3 \n\t" - "vpaddd 0x80(%%rax), %%ymm4, %%ymm4 \n\t" - "vpaddd 0xA0(%%rax), %%ymm5, %%ymm5 \n\t" - "vpaddd 0xC0(%%rax), %%ymm6, %%ymm6 \n\t" - "vpaddd 0xE0(%%rax), %%ymm7, %%ymm7 \n\t" - "vpaddd 0x100(%%rax), %%ymm8, %%ymm8 \n\t" - "vpaddd 0x120(%%rax), %%ymm9, %%ymm9 \n\t" - "vpaddd 0x140(%%rax), %%ymm10, %%ymm10 \n\t" - "vpaddd 0x160(%%rax), %%ymm11, %%ymm11 \n\t" - "vpaddd 0x180(%%rax), 
%%ymm12, %%ymm12 \n\t" - "vpaddd 0x1A0(%%rax), %%ymm13, %%ymm13 \n\t" - "vpaddd 0x1C0(%%rax), %%ymm14, %%ymm14 \n\t" - "vpaddd 0x1E0(%%rax), %%ymm15, %%ymm15 \n\t" - "vpaddd 0x200(%%rax), %%ymm16, %%ymm16 \n\t" - "vpaddd 0x220(%%rax), %%ymm17, %%ymm17 \n\t" - "vpaddd 0x240(%%rax), %%ymm18, %%ymm18 \n\t" - "vpaddd 0x260(%%rax), %%ymm19, %%ymm19 \n\t" - "vpaddd 0x280(%%rax), %%ymm20, %%ymm20 \n\t" - "vpaddd 0x2A0(%%rax), %%ymm21, %%ymm21 \n\t" - "vpaddd 0x2C0(%%rax), %%ymm22, %%ymm22 \n\t" - "vpaddd 0x2E0(%%rax), %%ymm23, %%ymm23 \n\t" + if (c.cross) { + convKernelForLoopXx16(24, 24, %%ymm, 0x0, 0x20, 0x40, 0x60, 0x80, 1) + } else { + convKernelForLoopXx16(24, 24, %%ymm, 0x0, 0x20, 0x40, 0x60, 0x80, 0) + } - ".align 16 \n\t" - "0: \n\t" - "movq %[flags], %%rcx \n\t" - "and $0xC, %%rcx \n\t" - "je 1f \n\t" + __asm__ __volatile__("movq %[output], %%rax \n\t" + "movq %[ostepC16], %%rbx \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x1, %%rcx \n\t" + "je 0f \n\t" + "vpaddd (%%rax), %%ymm0, %%ymm0 \n\t" + "vpaddd 0x20(%%rax), %%ymm1, %%ymm1 \n\t" + "vpaddd 0x40(%%rax), %%ymm2, %%ymm2 \n\t" + "vpaddd 0x60(%%rax), %%ymm3, %%ymm3 \n\t" + "vpaddd 0x80(%%rax), %%ymm4, %%ymm4 \n\t" + "vpaddd 0xA0(%%rax), %%ymm5, %%ymm5 \n\t" + "vpaddd 0xC0(%%rax), %%ymm6, %%ymm6 \n\t" + "vpaddd 0xE0(%%rax), %%ymm7, %%ymm7 \n\t" + "vpaddd 0x100(%%rax), %%ymm8, %%ymm8 \n\t" + "vpaddd 0x120(%%rax), %%ymm9, %%ymm9 \n\t" + "vpaddd 0x140(%%rax), %%ymm10, %%ymm10 \n\t" + "vpaddd 0x160(%%rax), %%ymm11, %%ymm11 \n\t" + "vpaddd 0x180(%%rax), %%ymm12, %%ymm12 \n\t" + "vpaddd 0x1A0(%%rax), %%ymm13, %%ymm13 \n\t" + "vpaddd 0x1C0(%%rax), %%ymm14, %%ymm14 \n\t" + "vpaddd 0x1E0(%%rax), %%ymm15, %%ymm15 \n\t" + "vpaddd 0x200(%%rax), %%ymm16, %%ymm16 \n\t" + "vpaddd 0x220(%%rax), %%ymm17, %%ymm17 \n\t" + "vpaddd 0x240(%%rax), %%ymm18, %%ymm18 \n\t" + "vpaddd 0x260(%%rax), %%ymm19, %%ymm19 \n\t" + "vpaddd 0x280(%%rax), %%ymm20, %%ymm20 \n\t" + "vpaddd 0x2A0(%%rax), %%ymm21, %%ymm21 \n\t" + "vpaddd 0x2C0(%%rax), %%ymm22, %%ymm22 \n\t" + "vpaddd 0x2E0(%%rax), %%ymm23, %%ymm23 \n\t" + + ".align 16 \n\t" + "0: \n\t" + "cmpq $0x0, %[scale] \n\t" + "jne 1f \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" relu24Regs(%%ymm) + "jmp 4f \n\t" - ".align 16 \n\t" - "1: \n\t" - "cmpq $0x0, %[scale] \n\t" - "je 2f \n\t" + ".align 16 \n\t" + "1: \n\t" convert24RegsI32ToF32(%[scale], %%ymm) - ".align 16 \n\t" - "2: \n\t" - "vmovups %%ymm0, (%%rax) \n\t" - "vmovups %%ymm1, 0x20(%%rax) \n\t" - "vmovups %%ymm2, 0x40(%%rax) \n\t" - "vmovups %%ymm3, 0x60(%%rax) \n\t" - "vmovups %%ymm4, 0x80(%%rax) \n\t" - "vmovups %%ymm5, 0xA0(%%rax) \n\t" - "vmovups %%ymm6, 0xC0(%%rax) \n\t" - "vmovups %%ymm7, 0xE0(%%rax) \n\t" - "vmovups %%ymm8, 0x100(%%rax) \n\t" - "vmovups %%ymm9, 0x120(%%rax) \n\t" - "vmovups %%ymm10, 0x140(%%rax) \n\t" - "vmovups %%ymm11, 0x160(%%rax) \n\t" - "vmovups %%ymm12, 0x180(%%rax) \n\t" - "vmovups %%ymm13, 0x1A0(%%rax) \n\t" - "vmovups %%ymm14, 0x1C0(%%rax) \n\t" - "vmovups %%ymm15, 0x1E0(%%rax) \n\t" - "vmovups %%ymm16, 0x200(%%rax) \n\t" - "vmovups %%ymm17, 0x220(%%rax) \n\t" - "vmovups %%ymm18, 0x240(%%rax) \n\t" - "vmovups %%ymm19, 0x260(%%rax) \n\t" - "vmovups %%ymm20, 0x280(%%rax) \n\t" - "vmovups %%ymm21, 0x2A0(%%rax) \n\t" - "vmovups %%ymm22, 0x2C0(%%rax) \n\t" - "vmovups %%ymm23, 0x2E0(%%rax) \n\t" + ".align 16 \n\t" + "2: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x2, %%rcx \n\t" + "je 3f \n\t" + "vaddps (%[eltwise]), %%ymm0, %%ymm0 \n\t" + "vaddps 0x20(%[eltwise]), %%ymm1, %%ymm1 \n\t" + "vaddps 
0x40(%[eltwise]), %%ymm2, %%ymm2 \n\t" + "vaddps 0x60(%[eltwise]), %%ymm3, %%ymm3 \n\t" + "vaddps 0x80(%[eltwise]), %%ymm4, %%ymm4 \n\t" + "vaddps 0xA0(%[eltwise]), %%ymm5, %%ymm5 \n\t" + "vaddps 0xC0(%[eltwise]), %%ymm6, %%ymm6 \n\t" + "vaddps 0xE0(%[eltwise]), %%ymm7, %%ymm7 \n\t" + "vaddps 0x100(%[eltwise]), %%ymm8, %%ymm8 \n\t" + "vaddps 0x120(%[eltwise]), %%ymm9, %%ymm9 \n\t" + "vaddps 0x140(%[eltwise]), %%ymm10, %%ymm10 \n\t" + "vaddps 0x160(%[eltwise]), %%ymm11, %%ymm11 \n\t" + "vaddps 0x180(%[eltwise]), %%ymm12, %%ymm12 \n\t" + "vaddps 0x1A0(%[eltwise]), %%ymm13, %%ymm13 \n\t" + "vaddps 0x1C0(%[eltwise]), %%ymm14, %%ymm14 \n\t" + "vaddps 0x1E0(%[eltwise]), %%ymm15, %%ymm15 \n\t" + "vaddps 0x200(%[eltwise]), %%ymm16, %%ymm16 \n\t" + "vaddps 0x220(%[eltwise]), %%ymm17, %%ymm17 \n\t" + "vaddps 0x240(%[eltwise]), %%ymm18, %%ymm18 \n\t" + "vaddps 0x260(%[eltwise]), %%ymm19, %%ymm19 \n\t" + "vaddps 0x280(%[eltwise]), %%ymm20, %%ymm20 \n\t" + "vaddps 0x2A0(%[eltwise]), %%ymm21, %%ymm21 \n\t" + "vaddps 0x2C0(%[eltwise]), %%ymm22, %%ymm22 \n\t" + "vaddps 0x2E0(%[eltwise]), %%ymm23, %%ymm23 \n\t" + + ".align 16 \n\t" + "3: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" + relu24RegsPs(%%ymm) + + ".align 16 \n\t" + "4: \n\t" + "vmovups %%ymm0, (%%rax) \n\t" + "vmovups %%ymm1, 0x20(%%rax) \n\t" + "vmovups %%ymm2, 0x40(%%rax) \n\t" + "vmovups %%ymm3, 0x60(%%rax) \n\t" + "vmovups %%ymm4, 0x80(%%rax) \n\t" + "vmovups %%ymm5, 0xA0(%%rax) \n\t" + "vmovups %%ymm6, 0xC0(%%rax) \n\t" + "vmovups %%ymm7, 0xE0(%%rax) \n\t" + "vmovups %%ymm8, 0x100(%%rax) \n\t" + "vmovups %%ymm9, 0x120(%%rax) \n\t" + "vmovups %%ymm10, 0x140(%%rax) \n\t" + "vmovups %%ymm11, 0x160(%%rax) \n\t" + "vmovups %%ymm12, 0x180(%%rax) \n\t" + "vmovups %%ymm13, 0x1A0(%%rax) \n\t" + "vmovups %%ymm14, 0x1C0(%%rax) \n\t" + "vmovups %%ymm15, 0x1E0(%%rax) \n\t" + "vmovups %%ymm16, 0x200(%%rax) \n\t" + "vmovups %%ymm17, 0x220(%%rax) \n\t" + "vmovups %%ymm18, 0x240(%%rax) \n\t" + "vmovups %%ymm19, 0x260(%%rax) \n\t" + "vmovups %%ymm20, 0x280(%%rax) \n\t" + "vmovups %%ymm21, 0x2A0(%%rax) \n\t" + "vmovups %%ymm22, 0x2C0(%%rax) \n\t" + "vmovups %%ymm23, 0x2E0(%%rax) \n\t" : - : [output] "r" (c.output), [ostepC16] "r" (c.ostepC16), [flags] "r" (c.flags), [scale] "r" (c.scale) - : "%rax", "%rbx", "%rcx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", - "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", - "%ymm15", "%ymm16", "%ymm17", "%ymm18", "%ymm19", "%ymm20", "%ymm21", "%ymm22", - "%ymm23", "%ymm24", "%ymm25", "%ymm26", "%ymm27", "%ymm28", "%ymm29", "%ymm30", - "%zmm31", "memory", "cc"); + : [output] "r" (c.output), + [ostepC16] "r" (c.ostepC16), + [eltwise] "r" (c.eltwise), + [flags] "r" (c.flags), + [scale] "r" (c.scale) + : "%rax", "%rbx", "%rcx", + "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", + "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", + "%ymm12", "%ymm13", "%ymm14", "%ymm15", "%ymm16", "%ymm17", + "%ymm18", "%ymm19", "%ymm20", "%ymm21", "%ymm22", "%ymm23", + "%ymm24", "%ymm25", "%ymm26", "%ymm27", "%ymm28", "%ymm29", + "%ymm30", "%ymm31", "memory", "cc"); } void Avx512ConvKernel12x8(ConvController &c) { - convKernelForLoopXx16(12, 12, %%ymm, 0x0, 0x20, 0x40, 0x60, 0x80) - - __asm__ __volatile__("movq %[output], %%rax \n\t" - "movq %[ostepC16], %%rbx \n\t" - "movq %[flags], %%rcx \n\t" - "and $0x1, %%rcx \n\t" - "je 0f \n\t" - "vpaddd (%%rax), %%ymm0, %%ymm0 \n\t" - "vpaddd 0x20(%%rax), %%ymm1, %%ymm1 \n\t" - "vpaddd 0x40(%%rax), %%ymm2, %%ymm2 
\n\t" - "vpaddd 0x60(%%rax), %%ymm3, %%ymm3 \n\t" - "vpaddd 0x80(%%rax), %%ymm4, %%ymm4 \n\t" - "vpaddd 0xA0(%%rax), %%ymm5, %%ymm5 \n\t" - "vpaddd 0xC0(%%rax), %%ymm6, %%ymm6 \n\t" - "vpaddd 0xE0(%%rax), %%ymm7, %%ymm7 \n\t" - "vpaddd 0x100(%%rax), %%ymm8, %%ymm8 \n\t" - "vpaddd 0x120(%%rax), %%ymm9, %%ymm9 \n\t" - "vpaddd 0x140(%%rax), %%ymm10, %%ymm10 \n\t" - "vpaddd 0x160(%%rax), %%ymm11, %%ymm11 \n\t" + if (c.cross) { + convKernelForLoopXx16(12, 12, %%ymm, 0x0, 0x20, 0x40, 0x60, 0x80, 1) + } else { + convKernelForLoopXx16(12, 12, %%ymm, 0x0, 0x20, 0x40, 0x60, 0x80, 0) + } - ".align 16 \n\t" - "0: \n\t" - "movq %[flags], %%rcx \n\t" - "and $0xC, %%rcx \n\t" - "je 1f \n\t" + __asm__ __volatile__("movq %[output], %%rax \n\t" + "movq %[ostepC16], %%rbx \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x1, %%rcx \n\t" + "je 0f \n\t" + "vpaddd (%%rax), %%ymm0, %%ymm0 \n\t" + "vpaddd 0x20(%%rax), %%ymm1, %%ymm1 \n\t" + "vpaddd 0x40(%%rax), %%ymm2, %%ymm2 \n\t" + "vpaddd 0x60(%%rax), %%ymm3, %%ymm3 \n\t" + "vpaddd 0x80(%%rax), %%ymm4, %%ymm4 \n\t" + "vpaddd 0xA0(%%rax), %%ymm5, %%ymm5 \n\t" + "vpaddd 0xC0(%%rax), %%ymm6, %%ymm6 \n\t" + "vpaddd 0xE0(%%rax), %%ymm7, %%ymm7 \n\t" + "vpaddd 0x100(%%rax), %%ymm8, %%ymm8 \n\t" + "vpaddd 0x120(%%rax), %%ymm9, %%ymm9 \n\t" + "vpaddd 0x140(%%rax), %%ymm10, %%ymm10 \n\t" + "vpaddd 0x160(%%rax), %%ymm11, %%ymm11 \n\t" + + ".align 16 \n\t" + "0: \n\t" + "cmpq $0x0, %[scale] \n\t" + "jne 1f \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" relu12Regs(%%ymm) + "jmp 4f \n\t" - ".align 16 \n\t" - "1: \n\t" - "cmpq $0x0, %[scale] \n\t" - "je 2f \n\t" + ".align 16 \n\t" + "1: \n\t" convert12RegsI32ToF32(%[scale], %%ymm) - ".align 16 \n\t" - "2: \n\t" - "vmovups %%ymm0, (%%rax) \n\t" - "vmovups %%ymm1, 0x20(%%rax) \n\t" - "vmovups %%ymm2, 0x40(%%rax) \n\t" - "vmovups %%ymm3, 0x60(%%rax) \n\t" - "vmovups %%ymm4, 0x80(%%rax) \n\t" - "vmovups %%ymm5, 0xA0(%%rax) \n\t" - "vmovups %%ymm6, 0xC0(%%rax) \n\t" - "vmovups %%ymm7, 0xE0(%%rax) \n\t" - "vmovups %%ymm8, 0x100(%%rax) \n\t" - "vmovups %%ymm9, 0x120(%%rax) \n\t" - "vmovups %%ymm10, 0x140(%%rax) \n\t" - "vmovups %%ymm11, 0x160(%%rax) \n\t" + ".align 16 \n\t" + "2: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x2, %%rcx \n\t" + "je 3f \n\t" + "vaddps (%[eltwise]), %%ymm0, %%ymm0 \n\t" + "vaddps 0x20(%[eltwise]), %%ymm1, %%ymm1 \n\t" + "vaddps 0x40(%[eltwise]), %%ymm2, %%ymm2 \n\t" + "vaddps 0x60(%[eltwise]), %%ymm3, %%ymm3 \n\t" + "vaddps 0x80(%[eltwise]), %%ymm4, %%ymm4 \n\t" + "vaddps 0xA0(%[eltwise]), %%ymm5, %%ymm5 \n\t" + "vaddps 0xC0(%[eltwise]), %%ymm6, %%ymm6 \n\t" + "vaddps 0xE0(%[eltwise]), %%ymm7, %%ymm7 \n\t" + "vaddps 0x100(%[eltwise]), %%ymm8, %%ymm8 \n\t" + "vaddps 0x120(%[eltwise]), %%ymm9, %%ymm9 \n\t" + "vaddps 0x140(%[eltwise]), %%ymm10, %%ymm10 \n\t" + "vaddps 0x160(%[eltwise]), %%ymm11, %%ymm11 \n\t" + + ".align 16 \n\t" + "3: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" + relu24RegsPs(%%ymm) + + ".align 16 \n\t" + "4: \n\t" + "vmovups %%ymm0, (%%rax) \n\t" + "vmovups %%ymm1, 0x20(%%rax) \n\t" + "vmovups %%ymm2, 0x40(%%rax) \n\t" + "vmovups %%ymm3, 0x60(%%rax) \n\t" + "vmovups %%ymm4, 0x80(%%rax) \n\t" + "vmovups %%ymm5, 0xA0(%%rax) \n\t" + "vmovups %%ymm6, 0xC0(%%rax) \n\t" + "vmovups %%ymm7, 0xE0(%%rax) \n\t" + "vmovups %%ymm8, 0x100(%%rax) \n\t" + "vmovups %%ymm9, 0x120(%%rax) \n\t" + "vmovups %%ymm10, 0x140(%%rax) \n\t" + "vmovups %%ymm11, 0x160(%%rax) \n\t" : - : [output] "r" (c.output), [ostepC16] "r" (c.ostepC16), [flags] "r" 
(c.flags), [scale] "r" (c.scale) - : "%rax", "%rbx", "%rcx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", - "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", - "%ymm15", "%ymm16", "%ymm17", "%ymm18", "%ymm19", "%ymm20", "%ymm21", "%ymm22", - "%ymm23", "%ymm24", "%ymm25", "%ymm26", "%ymm27", "%ymm28", "%ymm29", "%ymm30", - "%zmm31", "memory", "cc"); + : [output] "r" (c.output), + [ostepC16] "r" (c.ostepC16), + [eltwise] "r" (c.eltwise), + [flags] "r" (c.flags), + [scale] "r" (c.scale) + : "%rax", "%rbx", "%rcx", + "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", + "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", + "%ymm24","%ymm31", "memory", "cc"); } void Avx512ConvKernel1x8(ConvController &c) { - convKernelForLoopXx16(1, 1, %%ymm, 0x0, 0x20, 0x40, 0x60, 0x80) + if (c.cross) { + convKernelForLoopXx16(1, 1, %%ymm, 0x0, 0x20, 0x40, 0x60, 0x80, 1) + } else { + convKernelForLoopXx16(1, 1, %%ymm, 0x0, 0x20, 0x40, 0x60, 0x80, 0) + } - __asm__ __volatile__("movq %[output], %%rax \n\t" - "movq %[ostepC16], %%rbx \n\t" - "movq %[flags], %%rcx \n\t" - "and $0x1, %%rcx \n\t" - "je 0f \n\t" - "vpaddd (%%rax), %%ymm0, %%ymm0 \n\t" + __asm__ __volatile__("movq %[output], %%rax \n\t" + "movq %[ostepC16], %%rbx \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x1, %%rcx \n\t" + "je 0f \n\t" + "vpaddd (%%rax), %%ymm0, %%ymm0 \n\t" - ".align 16 \n\t" - "0: \n\t" - "movq %[flags], %%rcx \n\t" - "and $0xC, %%rcx \n\t" - "je 1f \n\t" + ".align 16 \n\t" + "0: \n\t" + "cmpq $0x0, %[scale] \n\t" + "jne 1f \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" reluReg(%%ymm) + "jmp 4f \n\t" - ".align 16 \n\t" - "1: \n\t" - "cmpq $0x0, %[scale] \n\t" - "je 2f \n\t" + ".align 16 \n\t" + "1: \n\t" convertRegI32ToF32(%[scale], %%ymm) - ".align 16 \n\t" - "2: \n\t" - "vmovups %%ymm0, (%%rax) \n\t" + ".align 16 \n\t" + "2: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x2, %%rcx \n\t" + "je 3f \n\t" + "vaddps (%[eltwise]), %%ymm0, %%ymm0 \n\t" + + ".align 16 \n\t" + "3: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" + reluRegPs(%%ymm) + + ".align 16 \n\t" + "4: \n\t" + "vmovups %%ymm0, (%%rax) \n\t" : - : [output] "r" (c.output), [ostepC16] "r" (c.ostepC16), [flags] "r" (c.flags), [scale] "r" (c.scale) - : "%rax", "%rbx", "%rcx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", - "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", - "%ymm15", "%ymm16", "%ymm17", "%ymm18", "%ymm19", "%ymm20", "%ymm21", "%ymm22", - "%ymm23", "%ymm24", "%ymm25", "%ymm26", "%ymm27", "%ymm28", "%ymm29", "%ymm30", - "%zmm31", "memory", "cc"); + : [output] "r" (c.output), + [ostepC16] "r" (c.ostepC16), + [eltwise] "r" (c.eltwise), + [flags] "r" (c.flags), + [scale] "r" (c.scale) + : "%rax", "%rbx", "%rcx", + "%ymm0", "%ymm24", "%ymm31", + "memory", "cc"); } // clang-format on EE convolution_direct(TensorDesc inputDesc, UINT8 *inArray, + F32 *eltwiseInput, TensorDesc filterDesc, const INT8 *filterArray, ConvolutionParamSpec convParamSpec, TensorDesc biasDesc, - const I32 *biasArray, + const F32 *biasArray, U32 tmpBytes, void *tmp, TensorDesc outputDesc, @@ -2227,10 +3345,10 @@ EE convolution_direct(TensorDesc inputDesc, // get computing params U32 strideH = convParamSpec.stride_h; U32 strideW = convParamSpec.stride_w; - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 
paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; U32 dilateH = convParamSpec.dilatedRate_h; U32 dilateW = convParamSpec.dilatedRate_w; U32 ih_pad = ih + paddingT + paddingB; @@ -2246,7 +3364,7 @@ EE convolution_direct(TensorDesc inputDesc, convCtl.dilateW = dilateW * SIMDW; convCtl.dilateH = (iw_pad - fw * dilateW + (dilateH - 1) * iw_pad) * SIMDW; convCtl.fStep = ((ih_pad - fh * dilateH) * iw_pad) * SIMDW; - convCtl.stepC16 = strideW * 16; + // convCtl.stepC16 = strideW * 16; convCtl.kw = fw; convCtl.kh = fh; convCtl.scale = nullptr; @@ -2273,9 +3391,12 @@ EE convolution_direct(TensorDesc inputDesc, tmp = (void *)((U8 *)tmp + tensorNumElements(outputDesc) * bytesOf(DT_I32)); outputDesc.dt = DT_I32; } + if (eltwiseInput != nullptr) { + outputDesc.dt = DT_F32; + } F32 *factorPtr = nullptr; F32 factor = 0; - if (scale != nullptr && odt == DT_F32) { + if (scale != nullptr && outputDesc.dt == DT_F32) { factor = 1 / (*scaleO); factorPtr = &factor; } @@ -2288,6 +3409,15 @@ EE convolution_direct(TensorDesc inputDesc, U32 oBytes = bytesOf(outputDesc.dt); UINT8 *tmpInput = (UINT8 *)tmp; + I64 step[72]; + I64 normalStep = strideW * 16; + I64 lastStep = (iw_pad - (ow - 1) * strideW + (strideH - 1) * iw_pad) * 16; + for (U32 i = 0; i < 24; ++i) { + step[i] = strideW * 16; + step[i + 24] = strideW * 8; + step[i + 48] = strideW * 4; + } + convCtl.stepC16 = step; for (U32 n = 0; n < in; ++n) { UINT8 *bInArray = inArray + n * ic * ih * iw; if (idf == DF_NCHWC16 && paddingT == 0 && paddingB == 0 && paddingL == 0 && paddingR == 0) { @@ -2306,6 +3436,7 @@ EE convolution_direct(TensorDesc inputDesc, icSize = UNI_MIN(BLOCK_IC_DIM, ic - icbb); flags |= (icbb > 0); if (icbb == ic - icSize) { + flags |= (eltwiseInput != nullptr) << 1; flags |= U32(activationDesc.mode) << 2; convCtl.scale = factorPtr; } @@ -2315,7 +3446,10 @@ EE convolution_direct(TensorDesc inputDesc, if (icSize < SIMDW) { simdC = icSizeArray[icSize >> 3]; } - for (U32 h = 0; h < oh; ++h) { + + U32 hwSize = 0; + for (U32 hw = 0; hw < oh * ow; hw += hwSize) { + hwSize = UNI_MIN(BLOCK_HW_DIM, oh * ow - hw); U32 ocSize = 0; for (U32 ocb = 0; ocb < oc; ocb += ocSize) { ocSize = UNI_MIN(unrollOc, oc - ocb); @@ -2325,23 +3459,43 @@ EE convolution_direct(TensorDesc inputDesc, UINT8 *curI = tmpInput + icbb * ih_pad * iw_pad; U32 wSize = 8; U32 unrollW = wSizeArray[ocSize >> 4]; - for (U32 w = 0; w < ow; w += wSize) { - wSize = UNI_MIN(ow - w, unrollW); + for (U32 ihw = hw; ihw < hw + hwSize; ihw += wSize) { + wSize = UNI_MIN(hw + hwSize - ihw, unrollW); U32 idx = wSize * 2 / unrollW; wSize = UNI_MAX(idx * unrollW / 2, 1); - U32 in_h = h * strideH; - U32 in_w = w * strideW; + U32 in_h = ihw / ow * strideH; + U32 in_w = ihw % ow * strideW; convCtl.input = curI + in_h * iw_pad * simdC + in_w * simdC; - convCtl.output = - output + ((n * oc + ocb) * ohow + (h * ow + w) * simdOc) * oBytes; + convCtl.output = output + ((n * oc + ocb) * ohow + ihw * simdOc) * oBytes; + convCtl.eltwise = eltwiseInput + (n * oc + ocb) * ohow + ihw * simdOc; convCtl.filter = filterArray + ocb * ic * fh * fw + ocSize * icbb * fh * fw; if ((ic % 16 != 0) && (icbb == (int)ic - icSize)) { U32 cx = (ic % 8 == 0) ? 
8 : 4; convCtl.f8Step = convCtl.fStep - (in_h * iw_pad + in_w) * (SIMDW - cx); convCtl.f4Step = convCtl.fStep / 2 - (in_h * iw_pad + in_w) * (8 - 4); } + convCtl.cross = false; + if ((ihw % ow + wSize) > ow) { + U32 lane = (ihw % ow + wSize) / ow; + if ((ihw % ow + wSize) % ow == 0) { + --lane; + } + for (U32 ui = 0; ui < lane; ++ui) { + convCtl.stepC16[(ihw / ow + ui + 1) * ow - ihw - 1] = lastStep; + } + convCtl.cross = true; + } convCtl.ic = icSize; kernel[ocSize >> 4][idx](convCtl); + if ((ihw % ow + wSize) > ow) { + U32 lane = (ihw % ow + wSize) / ow; + if ((ihw % ow + wSize) % ow == 0) { + --lane; + } + for (U32 ui = 0; ui < lane; ++ui) { + convCtl.stepC16[(ihw / ow + ui + 1) * ow - ihw - 1] = normalStep; + } + } } } } @@ -2353,6 +3507,7 @@ EE convolution_direct(TensorDesc inputDesc, F32 scales[2] = {-1, scaleO[0]}; TensorDesc qDesc = outputDesc; qDesc.dt = DT_U8_Q; + I32 *oi = (I32 *)output; CHECK_STATUS(quantize_x86(outputDesc, (void *)output, &qDesc, (void *)outArray, scales)); *scaleO = scales[0]; } diff --git a/compute/tensor/src/cpu/x86/int8/convolution_functions.h b/compute/tensor/src/cpu/x86/int8/convolution_functions.h new file mode 100644 index 00000000..20409f95 --- /dev/null +++ b/compute/tensor/src/cpu/x86/int8/convolution_functions.h @@ -0,0 +1,324 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
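+
+// ConvController carries the per-call state handed to each assembly micro-kernel:
+// input/filter/output/bias pointers, kernel geometry (kw, kh, dilation and filter
+// steps), the per-pixel input step table stepC16 (entries are rewritten with a
+// row-crossing step when an unrolled tile spans two output rows; see `cross`),
+// and post-processing controls (eltwise pointer, quantization scale, flags).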
+ +struct ConvController { + UINT8 *input; + const INT8 *filter; + void *output; + F32 *eltwise; + UINT8 *u8Output; + const I32 *bias; + I64 ic; + I64 kw; + I64 kh; + I64 *stepC16; + I64 dilateW; + I64 dilateH; + I64 ostepC16; + I64 flags; + I64 fStep; + I64 f8Step; + I64 f4Step; + void *scale; + bool cross; +}; + +typedef void (*kernelFunc)(ConvController &c); + +// clang-format off +#define clear1Regs(rtype) \ + "vxorps "#rtype"0, "#rtype"0, "#rtype"0 \n\t" + +#define clear2Regs(rtype) \ + clear1Regs(rtype) \ + "vxorps "#rtype"1, "#rtype"1, "#rtype"1 \n\t" + +#define clear3Regs(rtype) \ + clear2Regs(rtype) \ + "vxorps "#rtype"2, "#rtype"2, "#rtype"2 \n\t" + +#define clear12Regs(rtype) \ + clear3Regs(rtype) \ + "vxorps "#rtype"3, "#rtype"3, "#rtype"3 \n\t" \ + "vxorps "#rtype"4, "#rtype"4, "#rtype"4 \n\t" \ + "vxorps "#rtype"5, "#rtype"5, "#rtype"5 \n\t" \ + "vxorps "#rtype"6, "#rtype"6, "#rtype"6 \n\t" \ + "vxorps "#rtype"7, "#rtype"7, "#rtype"7 \n\t" \ + "vxorps "#rtype"8, "#rtype"8, "#rtype"8 \n\t" \ + "vxorps "#rtype"9, "#rtype"9, "#rtype"9 \n\t" \ + "vxorps "#rtype"10, "#rtype"10, "#rtype"10 \n\t" \ + "vxorps "#rtype"11, "#rtype"11, "#rtype"11 \n\t" + +#define clear24Regs(rtype) \ + clear12Regs(rtype) \ + "vxorps "#rtype"12, "#rtype"12, "#rtype"12 \n\t" \ + "vxorps "#rtype"13, "#rtype"13, "#rtype"13 \n\t" \ + "vxorps "#rtype"14, "#rtype"14, "#rtype"14 \n\t" \ + "vxorps "#rtype"15, "#rtype"15, "#rtype"15 \n\t" \ + "vxorps "#rtype"16, "#rtype"16, "#rtype"16 \n\t" \ + "vxorps "#rtype"17, "#rtype"17, "#rtype"17 \n\t" \ + "vxorps "#rtype"18, "#rtype"18, "#rtype"18 \n\t" \ + "vxorps "#rtype"19, "#rtype"19, "#rtype"19 \n\t" \ + "vxorps "#rtype"20, "#rtype"20, "#rtype"20 \n\t" \ + "vxorps "#rtype"21, "#rtype"21, "#rtype"21 \n\t" \ + "vxorps "#rtype"22, "#rtype"22, "#rtype"22 \n\t" \ + "vxorps "#rtype"23, "#rtype"23, "#rtype"23 \n\t" + +#define reluReg(rtype) \ + "vpxord "#rtype"31, "#rtype"31, "#rtype"31 \n\t" \ + "vpmaxsd "#rtype"31, "#rtype"0, "#rtype"0 \n\t" + +#define relu2Regs(rtype) \ + reluReg(rtype) \ + "vpmaxsd "#rtype"31, "#rtype"1, "#rtype"1 \n\t" + +#define relu3Regs(rtype) \ + relu2Regs(rtype) \ + "vpmaxsd "#rtype"31, "#rtype"2, "#rtype"2 \n\t" + +#define relu12Regs(rtype) \ + relu3Regs(rtype) \ + "vpmaxsd "#rtype"31, "#rtype"3, "#rtype"3 \n\t" \ + "vpmaxsd "#rtype"31, "#rtype"4, "#rtype"4 \n\t" \ + "vpmaxsd "#rtype"31, "#rtype"5, "#rtype"5 \n\t" \ + "vpmaxsd "#rtype"31, "#rtype"6, "#rtype"6 \n\t" \ + "vpmaxsd "#rtype"31, "#rtype"7, "#rtype"7 \n\t" \ + "vpmaxsd "#rtype"31, "#rtype"8, "#rtype"8 \n\t" \ + "vpmaxsd "#rtype"31, "#rtype"9, "#rtype"9 \n\t" \ + "vpmaxsd "#rtype"31, "#rtype"10, "#rtype"10 \n\t" \ + "vpmaxsd "#rtype"31, "#rtype"11, "#rtype"11 \n\t" + +#define relu24Regs(rtype) \ + relu12Regs(rtype) \ + "vpmaxsd "#rtype"31, "#rtype"12, "#rtype"12 \n\t" \ + "vpmaxsd "#rtype"31, "#rtype"13, "#rtype"13 \n\t" \ + "vpmaxsd "#rtype"31, "#rtype"14, "#rtype"14 \n\t" \ + "vpmaxsd "#rtype"31, "#rtype"15, "#rtype"15 \n\t" \ + "vpmaxsd "#rtype"31, "#rtype"16, "#rtype"16 \n\t" \ + "vpmaxsd "#rtype"31, "#rtype"17, "#rtype"17 \n\t" \ + "vpmaxsd "#rtype"31, "#rtype"18, "#rtype"18 \n\t" \ + "vpmaxsd "#rtype"31, "#rtype"19, "#rtype"19 \n\t" \ + "vpmaxsd "#rtype"31, "#rtype"20, "#rtype"20 \n\t" \ + "vpmaxsd "#rtype"31, "#rtype"21, "#rtype"21 \n\t" \ + "vpmaxsd "#rtype"31, "#rtype"22, "#rtype"22 \n\t" \ + "vpmaxsd "#rtype"31, "#rtype"23, "#rtype"23 \n\t" + + +#define reluRegPs(rtype) \ + "vpxord "#rtype"31, "#rtype"31, "#rtype"31 \n\t" \ + "vmaxps "#rtype"31, "#rtype"0, "#rtype"0 \n\t" + 
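+
+// The floating-point (*Ps) ReLU variants, the convert*RegsI32ToF32
+// dequantization macros (broadcast the scale into register 24, then vcvtdq2ps
+// and vmulps per accumulator) and the load*BiasTo*Regs initializers that
+// replicate the 48-, 32- or 16-channel bias vectors across the output tile
+// follow the same scheme.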
+#define relu2RegsPs(rtype) \ + reluReg(rtype) \ + "vmaxps "#rtype"31, "#rtype"1, "#rtype"1 \n\t" + +#define relu3RegsPs(rtype) \ + relu2Regs(rtype) \ + "vmaxps "#rtype"31, "#rtype"2, "#rtype"2 \n\t" + +#define relu12RegsPs(rtype) \ + relu3Regs(rtype) \ + "vmaxps "#rtype"31, "#rtype"3, "#rtype"3 \n\t" \ + "vmaxps "#rtype"31, "#rtype"4, "#rtype"4 \n\t" \ + "vmaxps "#rtype"31, "#rtype"5, "#rtype"5 \n\t" \ + "vmaxps "#rtype"31, "#rtype"6, "#rtype"6 \n\t" \ + "vmaxps "#rtype"31, "#rtype"7, "#rtype"7 \n\t" \ + "vmaxps "#rtype"31, "#rtype"8, "#rtype"8 \n\t" \ + "vmaxps "#rtype"31, "#rtype"9, "#rtype"9 \n\t" \ + "vmaxps "#rtype"31, "#rtype"10, "#rtype"10 \n\t" \ + "vmaxps "#rtype"31, "#rtype"11, "#rtype"11 \n\t" + +#define relu24RegsPs(rtype) \ + relu12Regs(rtype) \ + "vmaxps "#rtype"31, "#rtype"12, "#rtype"12 \n\t" \ + "vmaxps "#rtype"31, "#rtype"13, "#rtype"13 \n\t" \ + "vmaxps "#rtype"31, "#rtype"14, "#rtype"14 \n\t" \ + "vmaxps "#rtype"31, "#rtype"15, "#rtype"15 \n\t" \ + "vmaxps "#rtype"31, "#rtype"16, "#rtype"16 \n\t" \ + "vmaxps "#rtype"31, "#rtype"17, "#rtype"17 \n\t" \ + "vmaxps "#rtype"31, "#rtype"18, "#rtype"18 \n\t" \ + "vmaxps "#rtype"31, "#rtype"19, "#rtype"19 \n\t" \ + "vmaxps "#rtype"31, "#rtype"20, "#rtype"20 \n\t" \ + "vmaxps "#rtype"31, "#rtype"21, "#rtype"21 \n\t" \ + "vmaxps "#rtype"31, "#rtype"22, "#rtype"22 \n\t" \ + "vmaxps "#rtype"31, "#rtype"23, "#rtype"23 \n\t" + +#define convertRegI32ToF32(scalePtr, rtype) \ + "vbroadcastss ("#scalePtr"), "#rtype"24 \n\t" \ + "vcvtdq2ps "#rtype"0, "#rtype"0 \n\t" \ + "vmulps "#rtype"0, "#rtype"24, "#rtype"0 \n\t" \ + +#define convert2RegsI32ToF32(scalePtr, rtype) \ + "vbroadcastss ("#scalePtr"), "#rtype"24 \n\t" \ + "vcvtdq2ps "#rtype"0, "#rtype"0 \n\t" \ + "vcvtdq2ps "#rtype"1, "#rtype"1 \n\t" \ + "vmulps "#rtype"0, "#rtype"24, "#rtype"0 \n\t" \ + "vmulps "#rtype"1, "#rtype"24, "#rtype"1 \n\t" \ + +#define convert3RegsI32ToF32(scalePtr, rtype) \ + "vbroadcastss ("#scalePtr"), "#rtype"24 \n\t" \ + "vcvtdq2ps "#rtype"0, "#rtype"0 \n\t" \ + "vcvtdq2ps "#rtype"1, "#rtype"1 \n\t" \ + "vcvtdq2ps "#rtype"2, "#rtype"2 \n\t" \ + "vmulps "#rtype"0, "#rtype"24, "#rtype"0 \n\t" \ + "vmulps "#rtype"1, "#rtype"24, "#rtype"1 \n\t" \ + "vmulps "#rtype"2, "#rtype"24, "#rtype"2 \n\t" +#define convert12RegsI32ToF32(scalePtr, rtype) \ + "vbroadcastss ("#scalePtr"), "#rtype"24 \n\t" \ + "vcvtdq2ps "#rtype"0, "#rtype"0 \n\t" \ + "vcvtdq2ps "#rtype"1, "#rtype"1 \n\t" \ + "vcvtdq2ps "#rtype"2, "#rtype"2 \n\t" \ + "vcvtdq2ps "#rtype"3, "#rtype"3 \n\t" \ + "vcvtdq2ps "#rtype"4, "#rtype"4 \n\t" \ + "vcvtdq2ps "#rtype"5, "#rtype"5 \n\t" \ + "vcvtdq2ps "#rtype"6, "#rtype"6 \n\t" \ + "vcvtdq2ps "#rtype"7, "#rtype"7 \n\t" \ + "vcvtdq2ps "#rtype"8, "#rtype"8 \n\t" \ + "vcvtdq2ps "#rtype"9, "#rtype"9 \n\t" \ + "vcvtdq2ps "#rtype"10, "#rtype"10 \n\t" \ + "vcvtdq2ps "#rtype"11, "#rtype"11 \n\t" \ + "vmulps "#rtype"0, "#rtype"24, "#rtype"0 \n\t" \ + "vmulps "#rtype"1, "#rtype"24, "#rtype"1 \n\t" \ + "vmulps "#rtype"2, "#rtype"24, "#rtype"2 \n\t" \ + "vmulps "#rtype"3, "#rtype"24, "#rtype"3 \n\t" \ + "vmulps "#rtype"4, "#rtype"24, "#rtype"4 \n\t" \ + "vmulps "#rtype"5, "#rtype"24, "#rtype"5 \n\t" \ + "vmulps "#rtype"6, "#rtype"24, "#rtype"6 \n\t" \ + "vmulps "#rtype"7, "#rtype"24, "#rtype"7 \n\t" \ + "vmulps "#rtype"8, "#rtype"24, "#rtype"8 \n\t" \ + "vmulps "#rtype"9, "#rtype"24, "#rtype"9 \n\t" \ + "vmulps "#rtype"10, "#rtype"24, "#rtype"10 \n\t" \ + "vmulps "#rtype"11, "#rtype"24, "#rtype"11 \n\t" + +#define convert24RegsI32ToF32(scalePtr, rtype) \ + 
convert12RegsI32ToF32(scalePtr, rtype) \ + "vcvtdq2ps "#rtype"12, "#rtype"12 \n\t" \ + "vcvtdq2ps "#rtype"13, "#rtype"13 \n\t" \ + "vcvtdq2ps "#rtype"14, "#rtype"14 \n\t" \ + "vcvtdq2ps "#rtype"15, "#rtype"15 \n\t" \ + "vcvtdq2ps "#rtype"16, "#rtype"16 \n\t" \ + "vcvtdq2ps "#rtype"17, "#rtype"17 \n\t" \ + "vcvtdq2ps "#rtype"18, "#rtype"18 \n\t" \ + "vcvtdq2ps "#rtype"19, "#rtype"19 \n\t" \ + "vcvtdq2ps "#rtype"20, "#rtype"20 \n\t" \ + "vcvtdq2ps "#rtype"21, "#rtype"21 \n\t" \ + "vcvtdq2ps "#rtype"22, "#rtype"22 \n\t" \ + "vcvtdq2ps "#rtype"23, "#rtype"23 \n\t" \ + "vmulps "#rtype"12, "#rtype"24, "#rtype"12 \n\t" \ + "vmulps "#rtype"13, "#rtype"24, "#rtype"13 \n\t" \ + "vmulps "#rtype"14, "#rtype"24, "#rtype"14 \n\t" \ + "vmulps "#rtype"15, "#rtype"24, "#rtype"15 \n\t" \ + "vmulps "#rtype"16, "#rtype"24, "#rtype"16 \n\t" \ + "vmulps "#rtype"17, "#rtype"24, "#rtype"17 \n\t" \ + "vmulps "#rtype"18, "#rtype"24, "#rtype"18 \n\t" \ + "vmulps "#rtype"19, "#rtype"24, "#rtype"19 \n\t" \ + "vmulps "#rtype"20, "#rtype"24, "#rtype"20 \n\t" \ + "vmulps "#rtype"21, "#rtype"24, "#rtype"21 \n\t" \ + "vmulps "#rtype"22, "#rtype"24, "#rtype"22 \n\t" \ + "vmulps "#rtype"23, "#rtype"24, "#rtype"23 \n\t" + +#define load48BiasTo3Regs(bias) \ + "vmovups ("#bias"), %%zmm0 \n\t" \ + "vmovups 0x40("#bias"), %%zmm1 \n\t" \ + "vmovups 0x80("#bias"), %%zmm2 \n\t" \ + +#define load48BiasTo12Regs(bias) \ + load48BiasTo3Regs(bias) \ + "vmovups %%zmm0, %%zmm3 \n\t" \ + "vmovups %%zmm1, %%zmm4 \n\t" \ + "vmovups %%zmm2, %%zmm5 \n\t" \ + "vmovups %%zmm0, %%zmm6 \n\t" \ + "vmovups %%zmm1, %%zmm7 \n\t" \ + "vmovups %%zmm2, %%zmm8 \n\t" \ + "vmovups %%zmm0, %%zmm9 \n\t" \ + "vmovups %%zmm1, %%zmm10 \n\t" \ + "vmovups %%zmm2, %%zmm11 \n\t" + +#define load48BiasTo24Regs(bias) \ + load48BiasTo12Regs(bias) \ + "vmovups %%zmm0, %%zmm12 \n\t" \ + "vmovups %%zmm1, %%zmm13 \n\t" \ + "vmovups %%zmm2, %%zmm14 \n\t" \ + "vmovups %%zmm0, %%zmm15 \n\t" \ + "vmovups %%zmm1, %%zmm16 \n\t" \ + "vmovups %%zmm2, %%zmm17 \n\t" \ + "vmovups %%zmm0, %%zmm18 \n\t" \ + "vmovups %%zmm1, %%zmm19 \n\t" \ + "vmovups %%zmm2, %%zmm20 \n\t" \ + "vmovups %%zmm0, %%zmm21 \n\t" \ + "vmovups %%zmm1, %%zmm22 \n\t" \ + "vmovups %%zmm2, %%zmm23 \n\t" + +#define load32BiasTo2Regs(bias) \ + "vmovups ("#bias"), %%zmm0 \n\t" \ + "vmovups 0x40("#bias"), %%zmm1 \n\t" \ + +#define load32BiasTo12Regs(bias) \ + load32BiasTo2Regs(bias) \ + "vmovups %%zmm0, %%zmm2 \n\t" \ + "vmovups %%zmm1, %%zmm3 \n\t" \ + "vmovups %%zmm0, %%zmm4 \n\t" \ + "vmovups %%zmm1, %%zmm5 \n\t" \ + "vmovups %%zmm0, %%zmm6 \n\t" \ + "vmovups %%zmm1, %%zmm7 \n\t" \ + "vmovups %%zmm0, %%zmm8 \n\t" \ + "vmovups %%zmm1, %%zmm9 \n\t" \ + "vmovups %%zmm0, %%zmm10 \n\t" \ + "vmovups %%zmm1, %%zmm11 \n\t" + +#define load32BiasTo24Regs(bias) \ + load32BiasTo12Regs(bias) \ + "vmovups %%zmm0, %%zmm12 \n\t" \ + "vmovups %%zmm1, %%zmm13 \n\t" \ + "vmovups %%zmm0, %%zmm14 \n\t" \ + "vmovups %%zmm1, %%zmm15 \n\t" \ + "vmovups %%zmm0, %%zmm16 \n\t" \ + "vmovups %%zmm1, %%zmm17 \n\t" \ + "vmovups %%zmm0, %%zmm18 \n\t" \ + "vmovups %%zmm1, %%zmm19 \n\t" \ + "vmovups %%zmm0, %%zmm20 \n\t" \ + "vmovups %%zmm1, %%zmm21 \n\t" \ + "vmovups %%zmm0, %%zmm22 \n\t" \ + "vmovups %%zmm1, %%zmm23 \n\t" + +#define load16BiasTo1Regs(bias, rtype) \ + "vmovups ("#bias"), "#rtype"0 \n\t" + +#define load16BiasTo12Regs(bias, rtype) \ + load16BiasTo1Regs(bias, rtype) \ + "vmovups "#rtype"0, "#rtype"1 \n\t" \ + "vmovups "#rtype"0, "#rtype"2 \n\t" \ + "vmovups "#rtype"0, "#rtype"3 \n\t" \ + "vmovups "#rtype"0, "#rtype"4 \n\t" \ + "vmovups 
"#rtype"0, "#rtype"5 \n\t" \ + "vmovups "#rtype"0, "#rtype"6 \n\t" \ + "vmovups "#rtype"0, "#rtype"7 \n\t" \ + "vmovups "#rtype"0, "#rtype"8 \n\t" \ + "vmovups "#rtype"0, "#rtype"9 \n\t" \ + "vmovups "#rtype"0, "#rtype"10 \n\t" \ + "vmovups "#rtype"0, "#rtype"11 \n\t" + +#define load16BiasTo24Regs(bias, rtype) \ + load16BiasTo12Regs(bias, rtype) \ + "vmovups "#rtype"0, "#rtype"12 \n\t" \ + "vmovups "#rtype"0, "#rtype"13 \n\t" \ + "vmovups "#rtype"0, "#rtype"14 \n\t" \ + "vmovups "#rtype"0, "#rtype"15 \n\t" \ + "vmovups "#rtype"0, "#rtype"16 \n\t" \ + "vmovups "#rtype"0, "#rtype"17 \n\t" \ + "vmovups "#rtype"0, "#rtype"18 \n\t" \ + "vmovups "#rtype"0, "#rtype"19 \n\t" \ + "vmovups "#rtype"0, "#rtype"20 \n\t" \ + "vmovups "#rtype"0, "#rtype"21 \n\t" \ + "vmovups "#rtype"0, "#rtype"22 \n\t" \ + "vmovups "#rtype"0, "#rtype"23 \n\t" diff --git a/compute/tensor/src/cpu/x86/int8/convolution_transform.cpp b/compute/tensor/src/cpu/x86/int8/convolution_transform.cpp index b45767d0..f48cce31 100644 --- a/compute/tensor/src/cpu/x86/int8/convolution_transform.cpp +++ b/compute/tensor/src/cpu/x86/int8/convolution_transform.cpp @@ -110,7 +110,7 @@ inline EE convolution_transform_filter_kernel_int8(TensorDesc filterDesc, CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); if (fdf == ftmDataFormat) { *ftmDesc = filterDesc; - memcpy(ftmArray, filterArray, fn * fc * fh * fw * bytesOf(fdt)); + UNI_MEMCPY(ftmArray, filterArray, fn * fc * fh * fw * bytesOf(fdt)); return SUCCESS; } if (fdf != DF_NCHW) { diff --git a/compute/tensor/src/cpu/x86/int8/depthwise_convolution_direct.cpp b/compute/tensor/src/cpu/x86/int8/depthwise_convolution_direct.cpp new file mode 100644 index 00000000..8b313ce2 --- /dev/null +++ b/compute/tensor/src/cpu/x86/int8/depthwise_convolution_direct.cpp @@ -0,0 +1,596 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#include "uni.h" +#include "cpu/x86/fp32/convolution_functions.h" +#include "cpu/x86/tensor_computing_x86.h" + +#define UNROLL_W 4 +#define UNROLL_OC_BLOCK_DIM 16 +#define SIMDW 16 + +struct ConvController { + UINT8 *input; + const INT8 *filter; + void *output; + F32 *eltwise; + UINT8 *u8Output; + const I32 *bias; + I64 ic; + I64 kw; + I64 kh; + I64 *stepC16; + I64 ostepC16; + I64 flags; + I64 fStep; + I64 hStep; + I64 stride; + I64 k4Num; + void *scale; +}; + +typedef void (*kernelFunc)(ConvController &c); + +void Avx512DepthConvKernel16x16(ConvController &c) { + __asm__ __volatile__("prefetcht0 (%[output]) \n\t" + "prefetcht0 0x40(%[output]) \n\t" + "prefetcht0 0x80(%[output]) \n\t" + "prefetcht0 0xC0(%[output]) \n\t" + "prefetcht0 0x100(%[output]) \n\t" + "prefetcht0 0x140(%[output]) \n\t" + "prefetcht0 0x180(%[output]) \n\t" + "prefetcht0 0x1C0(%[output]) \n\t" + "vmovups (%[bias]), %%zmm0 \n\t" + "vmovups %%zmm0, %%zmm1 \n\t" + "vmovups %%zmm0, %%zmm2 \n\t" + "vmovups %%zmm0, %%zmm3 \n\t" + "vmovups %%zmm0, %%zmm4 \n\t" + "vmovups %%zmm0, %%zmm5 \n\t" + "vmovups %%zmm0, %%zmm6 \n\t" + "vmovups %%zmm0, %%zmm7 \n\t" + "vmovups %%zmm0, %%zmm8 \n\t" + "vmovups %%zmm0, %%zmm9 \n\t" + "vmovups %%zmm0, %%zmm10 \n\t" + "vmovups %%zmm0, %%zmm11 \n\t" + "vmovups %%zmm0, %%zmm12 \n\t" + "vmovups %%zmm0, %%zmm13 \n\t" + "vmovups %%zmm0, %%zmm14 \n\t" + "vmovups %%zmm0, %%zmm15 \n\t" + : + : [bias] "r" (c.bias), [flags] "r" (c.flags), [output] "r" (c.output) + : "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", + "%zmm7", "memory", "cc"); + + __asm__ __volatile__(".align 16 \n\t" + "0: \n\t" + "vmovups (%[filter]), %%zmm16 \n\t" + "vmovups (%[input]), %%zmm17 \n\t" + "vmovups 0x40(%[input]), %%zmm18 \n\t" + "vmovups 0x80(%[input]), %%zmm19 \n\t" + "vmovups 0xC0(%[input]), %%zmm20 \n\t" + "vpdpbusd %%zmm16, %%zmm17, %%zmm0 \n\t" + "vpdpbusd %%zmm16, %%zmm18, %%zmm1 \n\t" + "vpdpbusd %%zmm16, %%zmm19, %%zmm2 \n\t" + "vpdpbusd %%zmm16, %%zmm20, %%zmm3 \n\t" + "vmovups 0x100(%[input]), %%zmm21 \n\t" + "vmovups 0x140(%[input]), %%zmm22 \n\t" + "vmovups 0x180(%[input]), %%zmm23 \n\t" + "vmovups 0x1C0(%[input]), %%zmm24 \n\t" + "vpdpbusd %%zmm16, %%zmm21, %%zmm4 \n\t" + "vpdpbusd %%zmm16, %%zmm22, %%zmm5 \n\t" + "vpdpbusd %%zmm16, %%zmm23, %%zmm6 \n\t" + "vpdpbusd %%zmm16, %%zmm24, %%zmm7 \n\t" + "vmovups 0x200(%[input]), %%zmm25 \n\t" + "vmovups 0x240(%[input]), %%zmm26 \n\t" + "vmovups 0x280(%[input]), %%zmm27 \n\t" + "vmovups 0x2C0(%[input]), %%zmm28 \n\t" + "vpdpbusd %%zmm16, %%zmm25, %%zmm8 \n\t" + "vpdpbusd %%zmm16, %%zmm26, %%zmm9 \n\t" + "vpdpbusd %%zmm16, %%zmm27, %%zmm10 \n\t" + "vpdpbusd %%zmm16, %%zmm28, %%zmm11 \n\t" + "vmovups 0x300(%[input]), %%zmm17 \n\t" + "vmovups 0x340(%[input]), %%zmm18 \n\t" + "vmovups 0x380(%[input]), %%zmm19 \n\t" + "vmovups 0x3C0(%[input]), %%zmm20 \n\t" + "vpdpbusd %%zmm16, %%zmm17, %%zmm12 \n\t" + "vpdpbusd %%zmm16, %%zmm18, %%zmm13 \n\t" + "vpdpbusd %%zmm16, %%zmm19, %%zmm14 \n\t" + "vpdpbusd %%zmm16, %%zmm20, %%zmm15 \n\t" + "addq $0x40, %[filter] \n\t" + "addq %[hStep], %[input] \n\t" + "dec %%rcx \n\t" + "jg 0b \n\t" + : [input] "+r" (c.input), [filter] "+r" (c.filter) + : [k4Num] "c" (c.k4Num), [stride] "r" (c.stride), [hStep] "r" (c.hStep) + : "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", + "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", + "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", + "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", 
"%zmm30", + "%zmm31", "memory", "cc"); + + __asm__ __volatile__("cmpq $0x0, %[scale] \n\t" + "jne 1f \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" + "vpxord %%zmm31, %%zmm31, %%zmm31 \n\t" + "vpmaxsd %%zmm31, %%zmm0, %%zmm0 \n\t" + "vpmaxsd %%zmm31, %%zmm1, %%zmm1 \n\t" + "vpmaxsd %%zmm31, %%zmm2, %%zmm2 \n\t" + "vpmaxsd %%zmm31, %%zmm3, %%zmm3 \n\t" + "vpmaxsd %%zmm31, %%zmm4, %%zmm4 \n\t" + "vpmaxsd %%zmm31, %%zmm5, %%zmm5 \n\t" + "vpmaxsd %%zmm31, %%zmm6, %%zmm6 \n\t" + "vpmaxsd %%zmm31, %%zmm7, %%zmm7 \n\t" + "vpmaxsd %%zmm31, %%zmm8, %%zmm8 \n\t" + "vpmaxsd %%zmm31, %%zmm9, %%zmm9 \n\t" + "vpmaxsd %%zmm31, %%zmm10, %%zmm10 \n\t" + "vpmaxsd %%zmm31, %%zmm11, %%zmm11 \n\t" + "vpmaxsd %%zmm31, %%zmm12, %%zmm12 \n\t" + "vpmaxsd %%zmm31, %%zmm13, %%zmm13 \n\t" + "vpmaxsd %%zmm31, %%zmm14, %%zmm14 \n\t" + "vpmaxsd %%zmm31, %%zmm15, %%zmm15 \n\t" + "jmp 4f \n\t" + + ".align 16 \n\t" + "1: \n\t" + "vbroadcastss (%[scale]), %%zmm30 \n\t" + "vcvtdq2ps %%zmm30, %%zmm31 \n\t" + "vmulps %%zmm31, %%zmm0, %%zmm0 \n\t" + "vmulps %%zmm31, %%zmm1, %%zmm1 \n\t" + "vmulps %%zmm31, %%zmm2, %%zmm2 \n\t" + "vmulps %%zmm31, %%zmm3, %%zmm3 \n\t" + "vmulps %%zmm31, %%zmm4, %%zmm4 \n\t" + "vmulps %%zmm31, %%zmm5, %%zmm5 \n\t" + "vmulps %%zmm31, %%zmm6, %%zmm6 \n\t" + "vmulps %%zmm31, %%zmm7, %%zmm7 \n\t" + "vmulps %%zmm31, %%zmm8, %%zmm8 \n\t" + "vmulps %%zmm31, %%zmm9, %%zmm9 \n\t" + "vmulps %%zmm31, %%zmm10, %%zmm10 \n\t" + "vmulps %%zmm31, %%zmm11, %%zmm11 \n\t" + "vmulps %%zmm31, %%zmm12, %%zmm12 \n\t" + "vmulps %%zmm31, %%zmm13, %%zmm13 \n\t" + "vmulps %%zmm31, %%zmm14, %%zmm14 \n\t" + "vmulps %%zmm31, %%zmm15, %%zmm15 \n\t" + + ".align 16 \n\t" + "2: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x2, %%rcx \n\t" + "je 3f \n\t" + "vaddps (%[eltwise]), %%zmm0, %%zmm0 \n\t" + "vaddps 0x40(%[eltwise]), %%zmm1, %%zmm1 \n\t" + "vaddps 0x80(%[eltwise]), %%zmm2, %%zmm2 \n\t" + "vaddps 0xC0(%[eltwise]), %%zmm3, %%zmm3 \n\t" + "vaddps 0x100(%[eltwise]), %%zmm4, %%zmm4 \n\t" + "vaddps 0x140(%[eltwise]), %%zmm5, %%zmm5 \n\t" + "vaddps 0x180(%[eltwise]), %%zmm6, %%zmm6 \n\t" + "vaddps 0x1C0(%[eltwise]), %%zmm7, %%zmm7 \n\t" + "vaddps 0x200(%[eltwise]), %%zmm8, %%zmm8 \n\t" + "vaddps 0x240(%[eltwise]), %%zmm9, %%zmm9 \n\t" + "vaddps 0x280(%[eltwise]), %%zmm10, %%zmm10 \n\t" + "vaddps 0x2C0(%[eltwise]), %%zmm11, %%zmm11 \n\t" + "vaddps 0x300(%[eltwise]), %%zmm12, %%zmm12 \n\t" + "vaddps 0x340(%[eltwise]), %%zmm13, %%zmm13 \n\t" + "vaddps 0x380(%[eltwise]), %%zmm14, %%zmm14 \n\t" + "vaddps 0x3C0(%[eltwise]), %%zmm15, %%zmm15 \n\t" + + ".align 16 \n\t" + "3: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" + "vpxord %%zmm31, %%zmm31, %%zmm31 \n\t" + "vmaxps %%zmm31, %%zmm0, %%zmm0 \n\t" + "vmaxps %%zmm31, %%zmm1, %%zmm1 \n\t" + "vmaxps %%zmm31, %%zmm2, %%zmm2 \n\t" + "vmaxps %%zmm31, %%zmm3, %%zmm3 \n\t" + "vmaxps %%zmm31, %%zmm4, %%zmm4 \n\t" + "vmaxps %%zmm31, %%zmm5, %%zmm5 \n\t" + "vmaxps %%zmm31, %%zmm6, %%zmm6 \n\t" + "vmaxps %%zmm31, %%zmm7, %%zmm7 \n\t" + "vmaxps %%zmm31, %%zmm8, %%zmm8 \n\t" + "vmaxps %%zmm31, %%zmm9, %%zmm9 \n\t" + "vmaxps %%zmm31, %%zmm10, %%zmm10 \n\t" + "vmaxps %%zmm31, %%zmm11, %%zmm11 \n\t" + "vmaxps %%zmm31, %%zmm12, %%zmm12 \n\t" + "vmaxps %%zmm31, %%zmm13, %%zmm13 \n\t" + "vmaxps %%zmm31, %%zmm14, %%zmm14 \n\t" + "vmaxps %%zmm31, %%zmm15, %%zmm15 \n\t" + + ".align 16 \n\t" + "4: \n\t" + "vmovups %%zmm0, (%[output]) \n\t" + "vmovups %%zmm1, 0x40(%[output]) \n\t" + "vmovups %%zmm2, 0x80(%[output]) \n\t" + "vmovups %%zmm3, 
0xC0(%[output]) \n\t" + "vmovups %%zmm4, 0x100(%[output]) \n\t" + "vmovups %%zmm5, 0x140(%[output]) \n\t" + "vmovups %%zmm6, 0x180(%[output]) \n\t" + "vmovups %%zmm7, 0x1C0(%[output]) \n\t" + "vmovups %%zmm8, 0x200(%[output]) \n\t" + "vmovups %%zmm9, 0x240(%[output]) \n\t" + "vmovups %%zmm10, 0x280(%[output]) \n\t" + "vmovups %%zmm11, 0x2C0(%[output]) \n\t" + "vmovups %%zmm12, 0x300(%[output]) \n\t" + "vmovups %%zmm13, 0x340(%[output]) \n\t" + "vmovups %%zmm14, 0x380(%[output]) \n\t" + "vmovups %%zmm15, 0x3C0(%[output]) \n\t" + : + : [output] "r" (c.output), [eltwise] "r" (c.eltwise), [ostepC16] "r" (c.ostepC16), + [flags] "r" (c.flags), [scale] "r" (c.scale) + : "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", + "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", + "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", + "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", + "%zmm31", "memory", "cc"); +} + +void Avx512DepthConvKernel8x16(ConvController &c) { + __asm__ __volatile__("prefetcht0 (%[output]) \n\t" + "prefetcht0 0x40(%[output]) \n\t" + "prefetcht0 0x80(%[output]) \n\t" + "prefetcht0 0xC0(%[output]) \n\t" + "prefetcht0 0x100(%[output]) \n\t" + "prefetcht0 0x140(%[output]) \n\t" + "prefetcht0 0x180(%[output]) \n\t" + "prefetcht0 0x1C0(%[output]) \n\t" + "vmovups (%[bias]), %%zmm0 \n\t" + "vmovups %%zmm0, %%zmm1 \n\t" + "vmovups %%zmm0, %%zmm2 \n\t" + "vmovups %%zmm0, %%zmm3 \n\t" + "vmovups %%zmm0, %%zmm4 \n\t" + "vmovups %%zmm0, %%zmm5 \n\t" + "vmovups %%zmm0, %%zmm6 \n\t" + "vmovups %%zmm0, %%zmm7 \n\t" + : + : [bias] "r" (c.bias), [flags] "r" (c.flags), [output] "r" (c.output) + : "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", + "%zmm7", "memory", "cc"); + + __asm__ __volatile__(".align 16 \n\t" + "0: \n\t" + "vmovups (%[filter]), %%zmm16 \n\t" + "vmovups (%[input]), %%zmm17 \n\t" + "vmovups 0x40(%[input]), %%zmm18 \n\t" + "vmovups 0x80(%[input]), %%zmm19 \n\t" + "vmovups 0xC0(%[input]), %%zmm20 \n\t" + "vpdpbusd %%zmm16, %%zmm17, %%zmm0 \n\t" + "vpdpbusd %%zmm16, %%zmm18, %%zmm1 \n\t" + "vpdpbusd %%zmm16, %%zmm19, %%zmm2 \n\t" + "vpdpbusd %%zmm16, %%zmm20, %%zmm3 \n\t" + "vmovups 0x100(%[input]), %%zmm21 \n\t" + "vmovups 0x140(%[input]), %%zmm22 \n\t" + "vmovups 0x180(%[input]), %%zmm23 \n\t" + "vmovups 0x1C0(%[input]), %%zmm24 \n\t" + "vpdpbusd %%zmm16, %%zmm21, %%zmm4 \n\t" + "vpdpbusd %%zmm16, %%zmm22, %%zmm5 \n\t" + "vpdpbusd %%zmm16, %%zmm23, %%zmm6 \n\t" + "vpdpbusd %%zmm16, %%zmm24, %%zmm7 \n\t" + "addq $0x40, %[filter] \n\t" + "addq %[hStep], %[input] \n\t" + "dec %%rcx \n\t" + "jg 0b \n\t" + : [input] "+r" (c.input), [filter] "+r" (c.filter) + : [k4Num] "c" (c.k4Num), [stride] "r" (c.stride), [hStep] "r" (c.hStep) + : "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", + "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", + "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", + "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", + "%zmm31", "memory", "cc"); + + __asm__ __volatile__("cmpq $0x0, %[scale] \n\t" + "jne 1f \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" + "vpxord %%zmm31, %%zmm31, %%zmm31 \n\t" + "vpmaxsd %%zmm31, %%zmm0, %%zmm0 \n\t" + "vpmaxsd %%zmm31, %%zmm1, %%zmm1 \n\t" + "vpmaxsd %%zmm31, %%zmm2, %%zmm2 \n\t" + "vpmaxsd %%zmm31, %%zmm3, %%zmm3 \n\t" + "vpmaxsd %%zmm31, %%zmm4, %%zmm4 \n\t" + "vpmaxsd %%zmm31, %%zmm5, 
%%zmm5 \n\t" + "vpmaxsd %%zmm31, %%zmm6, %%zmm6 \n\t" + "vpmaxsd %%zmm31, %%zmm7, %%zmm7 \n\t" + "jmp 4f \n\t" + + ".align 16 \n\t" + "1: \n\t" + "vbroadcastss (%[scale]), %%zmm30 \n\t" + "vcvtdq2ps %%zmm30, %%zmm31 \n\t" + "vmulps %%zmm31, %%zmm0, %%zmm0 \n\t" + "vmulps %%zmm31, %%zmm1, %%zmm1 \n\t" + "vmulps %%zmm31, %%zmm2, %%zmm2 \n\t" + "vmulps %%zmm31, %%zmm3, %%zmm3 \n\t" + "vmulps %%zmm31, %%zmm4, %%zmm4 \n\t" + "vmulps %%zmm31, %%zmm5, %%zmm5 \n\t" + "vmulps %%zmm31, %%zmm6, %%zmm6 \n\t" + "vmulps %%zmm31, %%zmm7, %%zmm7 \n\t" + + ".align 16 \n\t" + "2: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x2, %%rcx \n\t" + "je 3f \n\t" + "vaddps (%[eltwise]), %%zmm0, %%zmm0 \n\t" + "vaddps 0x40(%[eltwise]), %%zmm1, %%zmm1 \n\t" + "vaddps 0x80(%[eltwise]), %%zmm2, %%zmm2 \n\t" + "vaddps 0xC0(%[eltwise]), %%zmm3, %%zmm3 \n\t" + "vaddps 0x100(%[eltwise]), %%zmm4, %%zmm4 \n\t" + "vaddps 0x140(%[eltwise]), %%zmm5, %%zmm5 \n\t" + "vaddps 0x180(%[eltwise]), %%zmm6, %%zmm6 \n\t" + "vaddps 0x1C0(%[eltwise]), %%zmm7, %%zmm7 \n\t" + + ".align 16 \n\t" + "3: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" + "vpxord %%zmm31, %%zmm31, %%zmm31 \n\t" + "vmaxps %%zmm31, %%zmm0, %%zmm0 \n\t" + "vmaxps %%zmm31, %%zmm1, %%zmm1 \n\t" + "vmaxps %%zmm31, %%zmm2, %%zmm2 \n\t" + "vmaxps %%zmm31, %%zmm3, %%zmm3 \n\t" + "vmaxps %%zmm31, %%zmm4, %%zmm4 \n\t" + "vmaxps %%zmm31, %%zmm5, %%zmm5 \n\t" + "vmaxps %%zmm31, %%zmm6, %%zmm6 \n\t" + "vmaxps %%zmm31, %%zmm7, %%zmm7 \n\t" + + ".align 16 \n\t" + "4: \n\t" + "vmovups %%zmm0, (%[output]) \n\t" + "vmovups %%zmm1, 0x40(%[output]) \n\t" + "vmovups %%zmm2, 0x80(%[output]) \n\t" + "vmovups %%zmm3, 0xC0(%[output]) \n\t" + "vmovups %%zmm4, 0x100(%[output]) \n\t" + "vmovups %%zmm5, 0x140(%[output]) \n\t" + "vmovups %%zmm6, 0x180(%[output]) \n\t" + "vmovups %%zmm7, 0x1C0(%[output]) \n\t" + : + : [output] "r" (c.output), [eltwise] "r" (c.eltwise), [ostepC16] "r" (c.ostepC16), + [flags] "r" (c.flags), [scale] "r" (c.scale) + : "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", + "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", + "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", + "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", + "%zmm31", "memory", "cc"); +} + +void Avx512DepthConvKernel1x16(ConvController &c) { + __asm__ __volatile__("prefetcht0 (%[output]) \n\t" + "vmovups (%[bias]), %%zmm0 \n\t" + : + : [bias] "r" (c.bias), [flags] "r" (c.flags), [output] "r" (c.output) + : "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", + "%zmm7", "memory", "cc"); + + __asm__ __volatile__(".align 16 \n\t" + "0: \n\t" + "vmovups (%[filter]), %%zmm16 \n\t" + "vmovups (%[input]), %%zmm17 \n\t" + "vpdpbusd %%zmm16, %%zmm17, %%zmm0 \n\t" + "addq $0x40, %[filter] \n\t" + "addq %[hStep], %[input] \n\t" + "dec %%rcx \n\t" + "jg 0b \n\t" + : [input] "+r" (c.input), [filter] "+r" (c.filter) + : [k4Num] "c" (c.k4Num), [stride] "r" (c.stride), [hStep] "r" (c.hStep) + : "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", + "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", + "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", + "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", + "%zmm31", "memory", "cc"); + + __asm__ __volatile__("cmpq $0x0, %[scale] \n\t" + "jne 1f \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" + "vpxord 
%%zmm31, %%zmm31, %%zmm31 \n\t" + "vpmaxsd %%zmm31, %%zmm0, %%zmm0 \n\t" + "jmp 4f \n\t" + + ".align 16 \n\t" + "1: \n\t" + "vbroadcastss (%[scale]), %%zmm30 \n\t" + "vcvtdq2ps %%zmm30, %%zmm31 \n\t" + "vmulps %%zmm31, %%zmm0, %%zmm0 \n\t" + + ".align 16 \n\t" + "2: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x2, %%rcx \n\t" + "je 3f \n\t" + "vaddps (%[eltwise]), %%zmm0, %%zmm0 \n\t" + + ".align 16 \n\t" + "3: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" + "vpxord %%zmm31, %%zmm31, %%zmm31 \n\t" + "vmaxps %%zmm31, %%zmm0, %%zmm0 \n\t" + + ".align 16 \n\t" + "4: \n\t" + "vmovups %%zmm0, (%[output]) \n\t" + : + : [output] "r" (c.output), [eltwise] "r" (c.eltwise), [ostepC16] "r" (c.ostepC16), + [flags] "r" (c.flags), [scale] "r" (c.scale) + : "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", + "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", + "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", + "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", + "%zmm31", "memory", "cc"); +} + +EE depthwise_pointwise_convolution_int8(TensorDesc inputDesc, + UINT8 *inArray, + F32 *eltwiseInput, + TensorDesc dwFilterDesc, + const INT8 *dwFilterArray, + TensorDesc pwFilterDesc, + const INT8 *pwFilterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc dwBiasDesc, + const F32 *dwBiasArray, + TensorDesc pwBiasDesc, + const F32 *pwBiasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *outArray, + F32 *scale, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec) +{ + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + I32 in, ic, ih, iw; + I32 fn, fc, fh, fw; + I32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGetI32(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGetI32(dwFilterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGetI32(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + + if ((idf != DF_NCHWC16) || (ic % 16 != 0)) { + CHECK_STATUS(NOT_MATCH); + } + + // get computing params + I32 strideH = convParamSpec.stride_h; + I32 strideW = convParamSpec.stride_w; + I32 paddingT = convParamSpec.pad_top; + I32 paddingB = convParamSpec.pad_bottom; + I32 paddingL = convParamSpec.pad_left; + I32 paddingR = convParamSpec.pad_right; + I32 dilateH = convParamSpec.dilatedRate_h; + I32 dilateW = convParamSpec.dilatedRate_w; + I32 fhDilated = (fh - 1) * dilateH + 1; + I32 fwDilated = (fw - 1) * dilateW + 1; + I32 ohow = oh * ow; + I32 fhfw = fh * fw; + I32 iw_pad = iw + paddingL + paddingR; + I32 ih_pad = ih + paddingT + paddingB; + + // infer kernel params + ConvController convCtl; + convCtl.ostepC16 = oh * ow * SIMDW * 4; + convCtl.fStep = ih_pad * iw_pad * SIMDW; + convCtl.kw = fw; + convCtl.kh = fh; + convCtl.scale = nullptr; + convCtl.stride = strideW; + + // fuse dw+pw + F32 *useOutArray = (F32 *)tmp; + if (pwFilterArray == nullptr) { + useOutArray = (F32 *)outArray; + } + F32 *output = (F32 *)useOutArray; + + const kernelFunc kernel[3] = { + Avx512DepthConvKernel1x16, Avx512DepthConvKernel8x16, Avx512DepthConvKernel16x16}; + U32 hwSizes[3] = {1, 8, 16}; + + // quantization + F32 *scaleI = scale; + F32 *scaleO = scale + 1; + F32 *scaleF = scale + 2; + if (idt != DT_U8_Q) { + //quantize to U8_Q + TensorDesc qDesc = inputDesc; + qDesc.dt = DT_U8_Q; + CHECK_STATUS(quantize_x86(inputDesc, (void *)inArray, &qDesc, tmp, scaleI)); + inArray = (UINT8 *)tmp; + tmp = (void *)((U8 *)tmp + 
tensorNumBytes(qDesc)); + } + *scaleO = scaleI[0] * scaleF[0]; + if (odt != DT_F32 && odt != DT_I32) { + output = (F32 *)tmp; + tmp = (void *)((U8 *)tmp + tensorNumElements(outputDesc) * bytesOf(DT_I32)); + outputDesc.dt = DT_I32; + } + if (eltwiseInput != nullptr) { + outputDesc.dt = DT_F32; + } + F32 *factorPtr = nullptr; + F32 factor = 0; + if (scale != nullptr && outputDesc.dt == DT_F32) { + factor = 1 / (*scaleO); + factorPtr = &factor; + } + + I32 *offsetC = (I32 *)tmp; + tmp = (void *)((U8 *)tmp + oc * bytesOf(DT_I32)); + CHECK_STATUS(quantize_bias_offsetC((const void *)dwBiasArray, dwBiasDesc, DT_I32, + (const void *)dwFilterArray, dwFilterDesc, scaleO, offsetC)); + dwFilterArray += oc * 4; + + U32 kernelSize = (fh * fw + 3) / 4 * 4; + convCtl.k4Num = kernelSize / 4; + UINT8 *tmpInput = (UINT8 *)tmp; + + I64 flags = 0; + flags |= (eltwiseInput != nullptr) << 1; + flags |= U32(depthwiseActivationParamSpec.mode) << 2; + convCtl.scale = factorPtr; + convCtl.flags = flags; + + for (I32 n = 0; n < in; ++n) { + I32 ocSize = 16; + // Padding + for (I32 ocb = 0; ocb < oc; ocb += ocSize) { + convCtl.bias = offsetC + ocb; + F32 *curO = output + (n * oc + ocb) * oh * ow; + I32 hwSize = 0; + UINT8 *curI = inArray + (n * ic + ocb) * ih * iw; + for (I32 hw = 0; hw < ohow; hw += hwSize) { + hwSize = UNI_MIN(ohow - hw, 16); + hwSize = hwSizes[hwSize >> 3]; + I32 h = hw / ow; + I32 w = hw % ow; + I32 in_h_0 = h * strideH; + I32 in_w_0 = w * strideW; + + // TODO: optimize + for (U32 kk = 0; kk < kernelSize; kk += 4) { + for (I32 ii = 0; ii < hwSize; ++ii) { + for (I32 jj = 0; jj < SIMDW; ++jj) { + for (I32 k4 = 0; k4 < 4; ++k4) { + I32 oidx = k4 + jj * 4 + ii * 4 * SIMDW + kk * SIMDW * hwSize; + if ((k4 + kk) < fhfw) { + in_h_0 = (hw + ii) / ow * strideH + (kk + k4) / fw; + in_w_0 = (hw + ii) % ow * strideW + (kk + k4) % fw; + I32 iidx = jj + (in_h_0 * iw + in_w_0) * SIMDW; + tmpInput[oidx] = curI[iidx]; + } else { + tmpInput[oidx] = 0; + } + } + } + } + } + + convCtl.input = tmpInput; + convCtl.output = curO + (h * ow + w) * SIMDW; + convCtl.filter = dwFilterArray + ocb * kernelSize; + convCtl.hStep = hwSize * SIMDW * 4; + kernel[hwSize >> 3](convCtl); + } + } + } + + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/x86/int8/depthwise_convolution_transform.cpp b/compute/tensor/src/cpu/x86/int8/depthwise_convolution_transform.cpp new file mode 100644 index 00000000..22c118e3 --- /dev/null +++ b/compute/tensor/src/cpu/x86/int8/depthwise_convolution_transform.cpp @@ -0,0 +1,99 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/x86/int8/tensor_computing_int8.h" +#include "cpu/x86/int8/transform_functions_int8.h" + +EE depthwise_convolution_transform_filter_int8( + TensorDesc filterDesc, const INT8 *filter, TensorDesc *ftmDesc, INT8 *filterTransformed) +{ + DataFormat ftmDataFormat = DF_NCHWN8HW4; // for flag, actually DF_NCHWN16HW4 + + DataType fdt; + DataFormat fdf; + U32 fn, fc, fh, fw; + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + if (fdf == ftmDataFormat) { + *ftmDesc = filterDesc; + UNI_MEMCPY(filterTransformed, filter, fn * fc * fh * fw * bytesOf(fdt)); + return SUCCESS; + } + if (fdf != DF_NCHW) { + CHECK_STATUS(NOT_SUPPORTED); + } + filterDesc = tensor4df(fdt, fdf, fc, 1, fh, fw); + *ftmDesc = tensor4df(fdt, ftmDataFormat, fc, 1, fh, fw); + + U32 fhfw = fh * fw; + U32 fhfwAligned = (fhfw + 3) / 4 * 4; + + I32 *offsetC = (I32 *)filterTransformed; + filterTransformed += fc * bytesOf(DT_I32); + for (U32 n = 0; n < fc; ++n) { + I32 sum = 0; + for (U32 i = 0; i < fh * fw; ++i) { + sum += filter[i + n * fh * fw]; + } + offsetC[n] = -128 * sum; + } + + for (U32 n = 0; n < fn; ++n) { + for (U32 c = 0; c < fc; c += 16) { + for (U32 hw = 0; hw < fhfwAligned; hw += 4) { + U32 c16; + for (c16 = 0; (c16 < 16) && (c16 < (fc - c)); ++c16) { + U32 w4; + for (w4 = 0; (w4 < 4) && (w4 < (fhfw - hw)); ++w4) { + U32 iidx = n * c * fhfw + (c + c16) * fhfw + hw + w4; + U32 oidx = n * c * fhfwAligned + c * fhfwAligned + hw * 16 + 4 * c16 + w4; + filterTransformed[oidx] = filter[iidx]; + } + for (; w4 < 4; ++w4) { + filterTransformed[n * c * fhfwAligned + c * fhfwAligned + hw * 16 + + 4 * c16 + w4] = 0; + } + } + for (; c16 < 16; ++c16) { + UNI_MEMSET( + filterTransformed + n * c * fhfw + c * fhfw + hw * 16 + c16 * 4, 0, 4); + } + } + } + } + + return SUCCESS; +} + +EE depthwise_pointwise_convolution_transform_filter_int8(TensorDesc dwFilterDesc, + const INT8 *dwFilter, + TensorDesc pwFilterDesc, + const INT8 *pwFilter, + DepthwiseConvolutionForwardAlgorithm algorithm, + TensorDesc *dwFtmDesc, + INT8 *dwFilterTransformed, + TensorDesc *pwFtmDesc, + INT8 *pwFilterTransformed) +{ + EE ret = depthwise_convolution_transform_filter_int8( + dwFilterDesc, dwFilter, dwFtmDesc, dwFilterTransformed); + CHECK_STATUS(ret); + if (pwFilter == nullptr) { + return ret; + } + + ConvolutionParamSpec p = createConvolutionParamSpec(1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, + 1, pwFilterDesc.dims[pwFilterDesc.nDims - 1], CONVOLUTION_POINTWISE); + ret = convolution_transform_filter_int8( + pwFilterDesc, pwFilter, p, CONVOLUTION_ALGORITHM_POINTWISE, pwFtmDesc, pwFilterTransformed); + return ret; +} diff --git a/compute/tensor/src/cpu/x86/int8/lstm.cpp b/compute/tensor/src/cpu/x86/int8/lstm.cpp new file mode 100644 index 00000000..ead97669 --- /dev/null +++ b/compute/tensor/src/cpu/x86/int8/lstm.cpp @@ -0,0 +1,189 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/x86/int8/tensor_computing_int8.h" +#include "cpu/x86/fp32/x86_functions_fp32.h" +#include "cpu/x86/fp32/mvm_nkn32.h" +#include "cpu/tensor_computing_cpu.h" +#include "blas_enhance.h" + +EE lstmcell_int8(TensorDesc xDesc, + const void *currentX, + const TensorDesc *filterDesc, + const void **filter, + const TensorDesc *biasDesc, + const void **bias, + F32 *scale, + void *state, + U32 tmpBytes, + void *tmp, + RNNParamSpec rnnParamSpec, + U32 batchStrideX, + U32 batchStrideH, + TensorDesc hDesc, + void *output, + Arch arch) +{ + UNUSED(biasDesc); + UNUSED(tmpBytes); + UNUSED(arch); + if (nullptr == filter || nullptr == bias || nullptr == state || nullptr == tmp || + nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ix; + U32 on, oh; + U32 fk, fn; + CHECK_STATUS(tensor2dGet(xDesc, &idt, &idf, &in, &ix)); + CHECK_STATUS(tensor2dGet(filterDesc[0], &fdt, &fdf, &fn, &fk)); + CHECK_STATUS(tensor2dGet(hDesc, &odt, &odf, &on, &oh)); + if (fdf != DF_NKN32) { + CHECK_STATUS(NOT_MATCH); + } + fn /= 32; + + U32 batch = in; + I32 xDim = ix; + I32 hDim = rnnParamSpec.num_outputs; + I32 column = (rnnParamSpec.num_projection > 0) ? rnnParamSpec.num_projection + : rnnParamSpec.num_outputs; + int num1 = rnnParamSpec.bi_direction ? 
2 : 1; + U32 steps = batchStrideH / hDim / num1; + if (!(idt == DT_F32 && fdt == DT_F32 && odt == DT_F32)) { + CHECK_STATUS(NOT_MATCH); + } + if (!(4 * column == (I32)fn * 32 && (ix + oh) == fk && in == on)) { + CHECK_STATUS(NOT_MATCH); + } + F32 forgetBias = rnnParamSpec.forget_bias; + if (rnnParamSpec.activation_type != ACTIVATION_TANH) { + CHECK_STATUS(NOT_SUPPORTED); + } + + const F32 *currentXArray = (const F32 *)currentX; + F32 *lastStateArray = (F32 *)state; + F32 *lastHArray = lastStateArray + column; + F32 *tmpArray = (F32 *)tmp; + F32 *currentStateArray = (F32 *)state; + F32 *currentHArray = currentStateArray + column; + F32 *outputArray = (F32 *)output; + F32 *xhArray = tmpArray; + F32 *intermediateH = xhArray + (xDim + hDim); + UINT8 *quant = (UINT8 *)(intermediateH + fn * 32); + U32 lastStateStride = column + hDim; + U32 lastHStride = column + hDim; + U32 currentStateStride = column + hDim; + U32 currentHStride = column + hDim; + __m256 forgetBiasVector = _mm256_set1_ps(forgetBias); + for (U32 m = 0; m < batch; m++) { + F32 *lastBatchH = lastHArray + m * lastHStride; + if (xDim > 0) { + UNI_MEMCPY(xhArray, currentXArray + m * batchStrideX, xDim * sizeof(F32)); + UNI_MEMCPY(xhArray + xDim, lastBatchH, hDim * sizeof(F32)); + } else { + intermediateH = tmpArray; + xhArray = lastBatchH; + } + const F32 *mBias = (const F32 *)bias[0] + m * steps * column * 4; + + TensorDesc aDesc = tensor2df(DT_I8, targetFormat4mvmMatrix(DT_I8), fn * 32, fk); + TensorDesc b0Desc = tensor1d(DT_F32, fk); + TensorDesc b1Desc = tensor1d(DT_U8_Q, fk); + TensorDesc cDesc = tensor1d(DT_F32, fn * 32); + F32 iScale = -1, fScale = *scale; + CHECK_STATUS(quantize_cpu(b0Desc, xhArray, &b1Desc, quant, &iScale, arch)); + F32 oScale = iScale * fScale; + UNI_MEMSET(intermediateH, 0, sizeof(F32) * fn * 32); + CHECK_STATUS(matrix_vector_multiply(aDesc, filter[0], b1Desc, quant, tmpBytes, + (void *)filter[0], cDesc, intermediateH, &oScale, arch)); + array_add_f32(intermediateH, mBias, intermediateH, fn * 32); + + F32 *out_i = intermediateH; + F32 *out_g = out_i + column; + F32 *out_f = out_i + column * 2; + F32 *out_o = out_i + column * 3; + + F32 *lastBatchState = lastStateArray + m * lastStateStride; + F32 *currentBatchState = currentStateArray + m * currentStateStride; + F32 *currentBatchH = currentHArray + m * currentHStride; + F32 *currentOutput = outputArray + m * batchStrideH; + + F32 *tmpState, *tmpHH, *tmpH; + if (rnnParamSpec.zoneout_cell == 0) { + tmpState = currentBatchState; + } else { + tmpState = out_i; + } + if (rnnParamSpec.num_projection > 0) { + tmpHH = out_g; + tmpH = currentOutput; + } else { + tmpHH = currentOutput; + tmpH = out_g; + } + + I32 h = 0; + for (; h < column - 7; h += 8) { + __m256 out_i_v = _mm256_loadu_ps(out_i + h); + __m256 out_g_v = _mm256_loadu_ps(out_g + h); + __m256 out_f_v = _mm256_loadu_ps(out_f + h); + __m256 out_o_v = _mm256_loadu_ps(out_o + h); + __m256 C_v = _mm256_loadu_ps(lastBatchState + h); + __m256 I_v = _mm256_sigmod_ps(out_i_v); + __m256 F_v = _mm256_sigmod_ps(_mm256_add_ps(out_f_v, forgetBiasVector)); + __m256 O_v = _mm256_sigmod_ps(out_o_v); + __m256 G_v = _mm256_tanh_ps(out_g_v); + C_v = _mm256_add_ps(_mm256_mul_ps(C_v, F_v), _mm256_mul_ps(I_v, G_v)); + __m256 out_hidden_v = _mm256_mul_ps(O_v, _mm256_tanh_ps(C_v)); + _mm256_storeu_ps(tmpState + h, C_v); + _mm256_storeu_ps(tmpHH + h, out_hidden_v); + } + for (; h < column; h++) { + F32 C_s = lastBatchState[h]; + F32 I_s = 1.0 / (1.0 + exp(-out_i[h])); + F32 F_s = 1.0 / (1.0 + exp(-(out_f[h] + forgetBias))); + 
F32 O_s = 1.0 / (1.0 + exp(-out_o[h])); + F32 G_s = tanh(out_g[h]); + C_s = C_s * F_s + I_s * G_s; + F32 value = O_s * tanh(C_s); + tmpState[h] = C_s; + tmpHH[h] = value; + } + if (rnnParamSpec.zoneout_cell != 0) { + array_scale_f32(tmpState, tmpState, column, 1 - rnnParamSpec.zoneout_cell, 0); + array_scale_f32(lastBatchState, lastBatchState, column, rnnParamSpec.zoneout_cell, 0); + array_add_f32(tmpState, lastBatchState, currentBatchState, column); + } + + if (rnnParamSpec.num_projection > 0) { + mvm_nkn32_with_bias(hDim / 32, rnnParamSpec.num_projection, (const F32 *)filter[1], + tmpHH, tmpH, nullptr); + } + + if (rnnParamSpec.zoneout_output != 0) { + if (rnnParamSpec.num_projection > 0) { + array_scale_f32(tmpH, out_f, hDim, 1 - rnnParamSpec.zoneout_output, 0); + } else { + array_scale_f32(tmpHH, out_f, hDim, 1 - rnnParamSpec.zoneout_output, 0); + } + array_scale_f32(lastBatchH, lastBatchH, hDim, rnnParamSpec.zoneout_output, 0); + array_add_f32(out_f, lastBatchH, currentBatchH, hDim); + } else { + UNI_MEMCPY(currentBatchH, currentOutput, sizeof(F32) * hDim); + } + } + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/x86/int8/pooling_int8.cpp b/compute/tensor/src/cpu/x86/int8/pooling_int8.cpp new file mode 100644 index 00000000..094dd16d --- /dev/null +++ b/compute/tensor/src/cpu/x86/int8/pooling_int8.cpp @@ -0,0 +1,429 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
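+
+// INT8 pooling kernels for the DF_NCHWC16 layout: max pooling compares the raw
+// U8 lanes with vpmaxub, while mean pooling widens to 32-bit, accumulates,
+// scales by a precomputed 65536/(kh*kw) factor and shifts right by 16 before
+// packing back to U8 with vpmovusdb; each call produces 1, 2 or 4 output
+// pixels.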
+ +#include "cpu/x86/fp32/tensor_computing_fp32.h" + +#define UNROLL_W 4 + +typedef void (*pooling_max_func)( + const UINT8 *curI, UINT8 *curO, U32 kw, U32 kh, U32 iStep, U32 stride); +typedef void (*pooling_mean_func)( + const UINT8 *curI, UINT8 *curO, U32 kw, U32 kh, U32 iStep, U32 stride, I32 poolSize); + +void pooling_c16_max_w4(const UINT8 *curI, UINT8 *curO, U32 kw, U32 kh, U32 iStep, U32 stride) +{ + __asm__ __volatile__("mov %%eax, %%eax \n\t" + "mov %4, %%eax \n\t" + "mov %%rax, %%rdi \n\t" + "mov %%eax, %%eax \n\t" + "mov %5, %%eax \n\t" + "mov %%rax, %%r9 \n\t" + "add %%r9, %%r9 \n\t" + "mov %%rax, %%r10 \n\t" + "add %%r9, %%r10 \n\t" + "add %0, %%rax \n\t" + "add %0, %%r9 \n\t" + "add %0, %%r10 \n\t" + + "vmovups (%0), %%xmm0 \n\t" + "vmovups (%%rax), %%xmm1 \n\t" + "vmovups (%%r9), %%xmm2 \n\t" + "vmovups (%%r10), %%xmm3 \n\t" + + ".align 16 \n\t" + "0: \n\t" + + "mov %2, %%ecx \n\t" + ".align 16 \n\t" + "1: \n\t" + + "vmovups (%0), %%xmm4 \n\t" + "vmovups (%%rax), %%xmm5 \n\t" + "vmovups (%%r9), %%xmm6 \n\t" + "vmovups (%%r10), %%xmm7 \n\t" + + "vpmaxub %%xmm0, %%xmm4, %%xmm0 \n\t" + "vpmaxub %%xmm1, %%xmm5, %%xmm1 \n\t" + "vpmaxub %%xmm2, %%xmm6, %%xmm2 \n\t" + "vpmaxub %%xmm3, %%xmm7, %%xmm3 \n\t" + + "add $0x10, %0 \n\t" + "add $0x10, %%rax \n\t" + "add $0x10, %%r9 \n\t" + "add $0x10, %%r10 \n\t" + "dec %%ecx \n\t" + "jg 1b \n\t" + + "add %%rdi, %0 \n\t" + "add %%rdi, %%rax \n\t" + "add %%rdi, %%r9 \n\t" + "add %%rdi, %%r10 \n\t" + "dec %%ebx \n\t" + "jg 0b \n\t" + + "vmovups %%xmm0, (%1) \n\t" + "vmovups %%xmm1, 0x10(%1) \n\t" + "vmovups %%xmm2, 0x20(%1) \n\t" + "vmovups %%xmm3, 0x30(%1) \n\t" + : + : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride) + : "%eax", "%rax", "%ecx", "%r10", "%r9", "%rdi", "%xmm0", "%xmm1", "%xmm2", + "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "memory", "cc"); +} + +void pooling_c16_max_w2(const UINT8 *curI, UINT8 *curO, U32 kw, U32 kh, U32 iStep, U32 stride) +{ + __asm__ __volatile__( + "mov %%eax, %%eax \n\t" + "mov %4, %%eax \n\t" + "mov %%rax, %%rdi \n\t" + "mov %%eax, %%eax \n\t" + "mov %5, %%eax \n\t" + "add %0, %%rax \n\t" + "vmovups (%0), %%xmm0 \n\t" + "vmovups (%%rax), %%xmm1 \n\t" + ".align 16 \n\t" + "0: \n\t" + "mov %2, %%ecx \n\t" + ".align 16 \n\t" + "1: \n\t" + "vmovups (%0), %%xmm4 \n\t" + "vmovups (%%rax), %%xmm5 \n\t" + "vpmaxub %%xmm0, %%xmm4, %%xmm0 \n\t" + "vpmaxub %%xmm1, %%xmm5, %%xmm1 \n\t" + "add $0x10, %0 \n\t" + "add $0x10, %%rax \n\t" + "dec %%ecx \n\t" + "jg 1b \n\t" + "add %%rdi, %0 \n\t" + "add %%rdi, %%rax \n\t" + "dec %%ebx \n\t" + "jg 0b \n\t" + "vmovups %%xmm0, (%1) \n\t" + "vmovups %%xmm1, 0x10(%1) \n\t" + : + : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride) + : "%eax", "%rax", "%ecx", "%rdi", "%xmm0", "%xmm1", "%xmm4", "%xmm5", "memory", "cc"); +} + +void pooling_c16_max_w1(const UINT8 *curI, UINT8 *curO, U32 kw, U32 kh, U32 iStep, U32 stride) +{ + __asm__ __volatile__("mov %%eax, %%eax \n\t" + "mov %4, %%eax \n\t" + "mov %%rax, %%rdi \n\t" + "vmovups (%0), %%xmm0 \n\t" + ".align 16 \n\t" + "0: \n\t" + "mov %2, %%ecx \n\t" + ".align 16 \n\t" + "1: \n\t" + "vmovups (%0), %%xmm4 \n\t" + "vpmaxub %%xmm0, %%xmm4, %%xmm0 \n\t" + "add $0x10, %0 \n\t" + "dec %%ecx \n\t" + "jg 1b \n\t" + "add %%rdi, %0 \n\t" + "dec %%ebx \n\t" + "jg 0b \n\t" + "vmovups %%xmm0, (%1) \n\t" + : + : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride) + : "%eax", "%rax", "%ecx", "%rdi", "%xmm0", "%xmm4", "memory", "cc"); +} + +void pooling_c16_mean_w4( + const UINT8 *curI, UINT8 *curO, U32 
kw, U32 kh, U32 iStep, U32 stride, I32 poolSize) +{ + __asm__ __volatile__( + "mov $-128, %%eax \n\t" + "imul %%ebx, %%eax \n\t" + "imul %2, %%eax \n\t" + "vmovd %%eax, %%xmm0 \n\t" + "vpbroadcastd %%xmm0, %%zmm10 \n\t" + "vpbroadcastd %%xmm0, %%zmm11 \n\t" + "vpbroadcastd %%xmm0, %%zmm12 \n\t" + "vpbroadcastd %%xmm0, %%zmm13 \n\t" + "mov $0x80, %%eax \n\t" + "vmovd %%eax, %%xmm1 \n\t" + "vpbroadcastb %%xmm1, %%xmm8 \n\t" + "mov %%eax, %%eax \n\t" + "mov %4, %%eax \n\t" + "mov %%rax, %%rdi \n\t" + "mov %5, %%eax \n\t" + "mov %%rax, %%r9 \n\t" + "add %%r9, %%r9 \n\t" + "mov %%rax, %%r10 \n\t" + "add %%r9, %%r10 \n\t" + "add %0, %%rax \n\t" + "add %0, %%r9 \n\t" + "add %0, %%r10 \n\t" + ".align 16 \n\t" + "0: \n\t" + "mov %2, %%ecx \n\t" + ".align 16 \n\t" + "1: \n\t" + "vmovups (%0), %%xmm4 \n\t" + "vmovups (%%rax), %%xmm5 \n\t" + "vmovups (%%r9), %%xmm6 \n\t" + "vmovups (%%r10), %%xmm7 \n\t" + "vpmovzxbd %%xmm4, %%zmm0 \n\t" + "vpmovzxbd %%xmm5, %%zmm1 \n\t" + "vpmovzxbd %%xmm6, %%zmm2 \n\t" + "vpmovzxbd %%xmm7, %%zmm3 \n\t" + "vpaddd %%zmm10, %%zmm0, %%zmm10 \n\t" + "vpaddd %%zmm11, %%zmm1, %%zmm11 \n\t" + "vpaddd %%zmm12, %%zmm2, %%zmm12 \n\t" + "vpaddd %%zmm13, %%zmm3, %%zmm13 \n\t" + "add $0x10, %0 \n\t" + "add $0x10, %%rax \n\t" + "add $0x10, %%r9 \n\t" + "add $0x10, %%r10 \n\t" + "dec %%ecx \n\t" + "jg 1b \n\t" + "add %%rdi, %0 \n\t" + "add %%rdi, %%rax \n\t" + "add %%rdi, %%r9 \n\t" + "add %%rdi, %%r10 \n\t" + "dec %%ebx \n\t" + "jg 0b \n\t" + "vbroadcastss (%6), %%zmm0 \n\t" + "vpmulld %%zmm0, %%zmm10, %%zmm10 \n\t" + "vpmulld %%zmm0, %%zmm11, %%zmm11 \n\t" + "vpmulld %%zmm0, %%zmm12, %%zmm12 \n\t" + "vpmulld %%zmm0, %%zmm13, %%zmm13 \n\t" + "vpsrld $16, %%zmm10, %%zmm10 \n\t" + "vpsrld $16, %%zmm11, %%zmm11 \n\t" + "vpsrld $16, %%zmm12, %%zmm12 \n\t" + "vpsrld $16, %%zmm13, %%zmm13 \n\t" + "mov $128, %%eax \n\t" + "vmovd %%eax, %%xmm0 \n\t" + "vpbroadcastd %%xmm0, %%zmm4 \n\t" + "vpaddd %%zmm10, %%zmm4, %%zmm10 \n\t" + "vpaddd %%zmm11, %%zmm4, %%zmm11 \n\t" + "vpaddd %%zmm12, %%zmm4, %%zmm12 \n\t" + "vpaddd %%zmm13, %%zmm4, %%zmm13 \n\t" + "vpmovusdb %%zmm10, (%1) \n\t" + "vpmovusdb %%zmm11, 0x10(%1) \n\t" + "vpmovusdb %%zmm12, 0x20(%1) \n\t" + "vpmovusdb %%zmm13, 0x30(%1) \n\t" + : + : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride), "r"(&poolSize) + : "%eax", "%rax", "%ecx", "%r10", "%r9", "%rdi", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", + "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "memory", "cc"); +} + +void pooling_c16_mean_w2( + const UINT8 *curI, UINT8 *curO, U32 kw, U32 kh, U32 iStep, U32 stride, I32 poolSize) +{ + __asm__ __volatile__( + "mov $-128, %%eax \n\t" + "imul %%ebx, %%eax \n\t" + "imul %2, %%eax \n\t" + "vmovd %%eax, %%xmm0 \n\t" + "vpbroadcastd %%xmm0, %%zmm10 \n\t" + "vpbroadcastd %%xmm0, %%zmm11 \n\t" + "mov %%eax, %%eax \n\t" + "mov %4, %%eax \n\t" + "mov %%rax, %%rdi \n\t" + "mov %5, %%eax \n\t" + "add %0, %%rax \n\t" + ".align 16 \n\t" + "0: \n\t" + "mov %2, %%ecx \n\t" + ".align 16 \n\t" + "1: \n\t" + "vmovups (%0), %%xmm4 \n\t" + "vmovups (%%rax), %%xmm5 \n\t" + "vpmovzxbd %%xmm4, %%zmm0 \n\t" + "vpmovzxbd %%xmm5, %%zmm1 \n\t" + "vpaddd %%zmm10, %%zmm0, %%zmm10 \n\t" + "vpaddd %%zmm11, %%zmm1, %%zmm11 \n\t" + "add $0x10, %0 \n\t" + "add $0x10, %%rax \n\t" + "dec %%ecx \n\t" + "jg 1b \n\t" + "add %%rdi, %0 \n\t" + "add %%rdi, %%rax \n\t" + "dec %%ebx \n\t" + "jg 0b \n\t" + "vbroadcastss (%6), %%zmm0 \n\t" + "vpmulld %%zmm0, %%zmm10, %%zmm10 \n\t" + "vpmulld %%zmm0, %%zmm11, %%zmm11 \n\t" + "vpsrld $16, 
%%zmm10, %%zmm10 \n\t" + "vpsrld $16, %%zmm11, %%zmm11 \n\t" + "mov $128, %%eax \n\t" + "vmovd %%eax, %%xmm0 \n\t" + "vpbroadcastd %%xmm0, %%zmm4 \n\t" + "vpaddd %%zmm10, %%zmm4, %%zmm10 \n\t" + "vpaddd %%zmm11, %%zmm4, %%zmm11 \n\t" + "vpmovusdb %%zmm10, (%1) \n\t" + "vpmovusdb %%zmm11, 0x10(%1) \n\t" + : + : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride), "r"(&poolSize) + : "%eax", "%rax", "%ecx", "%rdi", "%zmm0", "%zmm1", "%zmm4", "%zmm5", "%zmm8", "%zmm10", + "%zmm11", "memory", "cc"); +} + +void pooling_c16_mean_w1( + const UINT8 *curI, UINT8 *curO, U32 kw, U32 kh, U32 iStep, U32 stride, I32 poolSize) +{ + __asm__ __volatile__( + "mov %%eax, %%eax \n\t" + "mov %4, %%eax \n\t" + "mov %%rax, %%rdi \n\t" + "mov $-128, %%eax \n\t" + "imul %%ebx, %%eax \n\t" + "imul %2, %%eax \n\t" + "vmovd %%eax, %%xmm0 \n\t" + "vpbroadcastd %%xmm0, %%zmm10 \n\t" + ".align 16 \n\t" + "0: \n\t" + "mov %2, %%ecx \n\t" + ".align 16 \n\t" + "1: \n\t" + "vmovups (%0), %%xmm4 \n\t" + "vpmovzxbd %%xmm4, %%zmm0 \n\t" + "vpaddd %%zmm10, %%zmm0, %%zmm10 \n\t" + "add $0x10, %0 \n\t" + "dec %%ecx \n\t" + "jg 1b \n\t" + "add %%rdi, %0 \n\t" + "dec %%ebx \n\t" + "jg 0b \n\t" + "vbroadcastss (%6), %%zmm0 \n\t" + "vpmulld %%zmm0, %%zmm10, %%zmm10 \n\t" + "mov $128, %%eax \n\t" + "vmovd %%eax, %%xmm0 \n\t" + "vpbroadcastd %%xmm0, %%zmm4 \n\t" + "vpsrld $16, %%zmm10, %%zmm10 \n\t" + "vpaddd %%zmm10, %%zmm4, %%zmm10 \n\t" + "vpmovusdb %%zmm10, (%1) \n\t" + : + : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride), "r"(&poolSize) + : "%eax", "%rax", "%ecx", "%rdi", "%zmm0", "%zmm1", "%zmm4", "%zmm8", "%zmm10", "memory", + "cc"); +} + +EE pooling_c16_uint8(TensorDesc inputDesc, + const UINT8 *input, + PoolingParamSpec p, + TensorDesc outputDesc, + UINT8 *output, + void *scale) +{ + if (nullptr == input || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt, odt; + DataFormat idf, odf; + U32 in = 0, ic = 0, ih = 0, iw = 0, on = 0, oc = 0, oh = 0, ow = 0; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + + if (idt != odt || idt != DT_U8_Q) { + CHECK_STATUS(NOT_MATCH); + } + if (in != on || ic != oc) { + CHECK_STATUS(NOT_MATCH); + } + if (idf != DF_NCHWC16 || odf != idf) { + CHECK_STATUS(NOT_MATCH); + } + + PoolingMode pm = p.mode; + U32 strideH = p.stride_h; + U32 strideW = p.stride_w; + U32 paddingT = p.pad_top; + U32 paddingL = p.pad_left; + U32 kernelSizeH = p.kernel_h; + U32 kernelSizeW = p.kernel_w; + U32 wSize, kh, kw, iStep; + UINT8 *curO; + const UINT8 *curI; + if (paddingT >= kernelSizeH || paddingL >= kernelSizeW) { + CHECK_STATUS(NOT_SUPPORTED); + } + + if (ic % 16 != 0) { + CHECK_STATUS(NOT_MATCH); + } + + F32 *inputScale = (F32 *)scale; + F32 *outputScale = inputScale + 1; + I32 shift = 65536; + I32 factor = shift / (kernelSizeH * kernelSizeW); + if (factor < 1) { + CHECK_STATUS(NOT_SUPPORTED); + } + if (pm == POOLING_MAX) { + *outputScale = *inputScale; + } else { + *outputScale = *inputScale * factor * (kernelSizeW * kernelSizeH) / (F32)shift; + } + + ic /= 16; + U32 owInter = (iw + paddingL - kernelSizeW) / strideW + 1; + U32 wSizes[3] = {1, 2, 4}; + pooling_max_func pooling_max[3] = {pooling_c16_max_w1, pooling_c16_max_w2, pooling_c16_max_w4}; + pooling_mean_func pooling_mean[3] = { + pooling_c16_mean_w1, pooling_c16_mean_w2, pooling_c16_mean_w4}; + F32 poolSize = shift / (kernelSizeH * kernelSizeW); + for (U32 n = 0; n < in; n++) { + for (U32 c = 0; c < ic; c++) { + 
for (U32 h = 0; h < oh; h++) { + for (U32 w = 0; w < ow; w += wSize) { + if (w < owInter) { + wSize = UNI_MIN(owInter - w, UNROLL_W); + } else { + wSize = 1; + } + wSize = wSizes[wSize >> 1]; + int hstart = (int)h * (int)strideH - (int)paddingT; + int wstart = (int)w * (int)strideW - (int)paddingL; + int hend = UNI_MIN(hstart + kernelSizeH, ih); + int wend = UNI_MIN(wstart + kernelSizeW, iw); + hstart = UNI_MAX(hstart, 0); + wstart = UNI_MAX(wstart, 0); + + curI = input + (hstart * iw + wstart) * 16; + curO = output + (h * ow + w) * 16; + kh = hend - hstart; + kw = wend - wstart; + iStep = (iw - kw) * 16; + if (!p.count_include_pad) { + poolSize = shift / (kh * kw); + } + if (kw < kernelSizeW) { + wSize = 1; + } + switch (pm) { + case POOLING_MAX: { + pooling_max[wSize >> 1](curI, curO, kw, kh, iStep, strideW * 16); + break; + } + case POOLING_MEAN: { + pooling_mean[wSize >> 1]( + curI, curO, kw, kh, iStep, strideW * 16, poolSize); + break; + } + default: + return NOT_SUPPORTED; + } + } + } + input += ih * iw * 16; + output += oh * ow * 16; + } + } + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/x86/int8/quantize.cpp b/compute/tensor/src/cpu/x86/int8/quantize.cpp index 5910545b..6b8af3e0 100644 --- a/compute/tensor/src/cpu/x86/int8/quantize.cpp +++ b/compute/tensor/src/cpu/x86/int8/quantize.cpp @@ -20,6 +20,7 @@ inline void getSymmetricQuantizeScale(U32 num16, U32 resMask, const F32 *data, F32 *scale) { + F32 maxVal = 0; __asm__ __volatile__("vxorps %%zmm0, %%zmm0, %%zmm0 \n\t" "mov $0x7FFFFFFF, %%ebx \n\t" "vmovd %%ebx, %%xmm1 \n\t" @@ -55,17 +56,20 @@ inline void getSymmetricQuantizeScale(U32 num16, U32 resMask, const F32 *data, F "vmaxps %%xmm1, %%xmm0, %%xmm0 \n\t" "vpermilps $0b00000001, %%xmm0, %%xmm1 \n\t" "vmaxps %%xmm1, %%xmm0, %%xmm0 \n\t" - "mov $0x42FE0000, %%ebx \n\t" - "vmovd %%ebx, %%xmm1 \n\t" - "vdivps %%xmm0, %%xmm1, %%xmm2 \n\t" - "vmovss %%xmm2, (%1) \n\t" - : "+r"(data), "+r"(scale) + "vmovd %%xmm0, %1 \n\t" + : "+r"(data), "+r"(maxVal) : "r"(num16), "a"(resMask) : "%k2", "%ebx", "%zmm0", "%zmm1", "%zmm2", "memory", "cc"); + if (maxVal == 0) { + *scale = 1; + } else { + *scale = 127 / maxVal; + } } inline void getSymmetricQuantizeScaleI32(U32 num16, U32 resMask, const I32 *data, F32 *scale) { + F32 maxVal = 0; __asm__ __volatile__("vxorps %%zmm0, %%zmm0, %%zmm0 \n\t" "mov %2, %%ebx \n\t" "cmp $0x0, %%ebx \n\t" @@ -98,14 +102,16 @@ inline void getSymmetricQuantizeScaleI32(U32 num16, U32 resMask, const I32 *data "vpmaxsd %%xmm1, %%xmm0, %%xmm0 \n\t" "vpermilps $0b00000001, %%xmm0, %%xmm1 \n\t" "vpmaxsd %%xmm1, %%xmm0, %%xmm0 \n\t" - "mov $0x42FE0000, %%ebx \n\t" - "vmovd %%ebx, %%xmm1 \n\t" "vcvtdq2ps %%xmm0, %%xmm0 \n\t" - "vdivps %%xmm0, %%xmm1, %%xmm2 \n\t" - "vmovss %%xmm2, (%1) \n\t" - : "+r"(data), "+r"(scale) + "vmovd %%xmm0, %1 \n\t" + : "+r"(data), "+r"(maxVal) : "r"(num16), "a"(resMask) : "%k2", "%ebx", "%zmm0", "%zmm1", "%zmm2", "memory", "cc"); + if (maxVal == 0) { + *scale = 1; + } else { + *scale = 127 / maxVal; + } } EE quantizeF32ToU8(TensorDesc dDesc, const F32 *data, TensorDesc *qDesc, UINT8 *qData, F32 *scale) @@ -223,7 +229,7 @@ EE quantizeF32ToI8(TensorDesc dDesc, const F32 *data, TensorDesc *qDesc, INT8 *q return SUCCESS; } -EE quantizeBiasOffsetCI32(F32 *bias, +EE quantizeBiasOffsetCI32(const F32 *bias, TensorDesc biasDesc, INT8 *filter, TensorDesc filterDesc, @@ -233,17 +239,20 @@ EE quantizeBiasOffsetCI32(F32 *bias, U32 N = tensorNumElements(biasDesc); std::set nativeFormat = {DF_NCHW, DF_NHWC, DF_MTK, DF_NORMAL, DF_TRANSPOSE}; I32 
*offsetC = (I32 *)filter; - if (bias == nullptr || N == 0) { + if ((bias == nullptr) && (filter == nullptr)) { + return SUCCESS; + } + if ((bias == nullptr) || (N == 0)) { N = UNI_MAX(filterDesc.dims[0], filterDesc.dims[1]); if (nativeFormat.count(filterDesc.df)) { - memset(offsetCBias, 0, N * bytesOf(DT_I32)); + UNI_MEMSET(offsetCBias, 0, N * bytesOf(DT_I32)); } else { - memcpy(offsetCBias, offsetC, N * bytesOf(DT_I32)); + UNI_MEMCPY(offsetCBias, offsetC, N * bytesOf(DT_I32)); } return SUCCESS; } - if (nativeFormat.count(filterDesc.df)) { + if ((filter == nullptr) || nativeFormat.count(filterDesc.df)) { for (U32 i = 0; i < N; ++i) { offsetCBias[i] = round(bias[i] * scale[0]); } @@ -259,7 +268,12 @@ EE transformU8ToI8(TensorDesc dDesc, const UINT8 *data, TensorDesc *qDesc, INT8 { U32 dataNum = tensorNumElements(dDesc); U32 num16 = dataNum / 64; - I64 resMask = pow(2, dataNum % 64) - 1; + U64 resMask = dataNum % 64; + if (resMask == 63) { + resMask = 0xFFFFFFFFFFFFFFFF; + } else { + resMask = (1LL << resMask) - 1; + } __asm__ __volatile__("mov $0x80, %%ebx \n\t" "vmovd %%ebx, %%xmm1 \n\t" diff --git a/compute/tensor/src/cpu/x86/int8/rnn.cpp b/compute/tensor/src/cpu/x86/int8/rnn.cpp new file mode 100644 index 00000000..ee9fed99 --- /dev/null +++ b/compute/tensor/src/cpu/x86/int8/rnn.cpp @@ -0,0 +1,44 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#include "cpu/x86/int8/tensor_computing_int8.h" + +EE rnncell_int8(TensorDesc xDesc, + const void *currentX, + const TensorDesc *filterDesc, + const void **filter, + const TensorDesc *biasDesc, + const void **bias, + F32* scale, + void *state, + U32 tmpBytes, + void *tmp, + RNNParamSpec rnnParamSpec, + U32 batchStrideX, + U32 batchStrideH, + TensorDesc hDesc, + void *output, + Arch arch) +{ + EE ret = NOT_SUPPORTED; + switch (rnnParamSpec.mode) { + case RNN_LSTM: { + ret = lstmcell_int8(xDesc, currentX, filterDesc, filter, biasDesc, bias, scale, state, + tmpBytes, tmp, rnnParamSpec, batchStrideX, batchStrideH, hDesc, output, arch); + break; + } + default: + break; + } + return ret; +} diff --git a/compute/tensor/src/cpu/x86/int8/tensor_computing_int8.h b/compute/tensor/src/cpu/x86/int8/tensor_computing_int8.h index 4639d703..0ec53c5c 100644 --- a/compute/tensor/src/cpu/x86/int8/tensor_computing_int8.h +++ b/compute/tensor/src/cpu/x86/int8/tensor_computing_int8.h @@ -21,7 +21,7 @@ EE dequantizeI32ToF32(TensorDesc qDesc, I32 *qData, const F32 *scale, TensorDesc dDesc, F32 *data); -EE quantizeBiasOffsetCI32(F32 *bias, +EE quantizeBiasOffsetCI32(const F32 *bias, TensorDesc biasDesc, INT8 *filter, TensorDesc filterDesc, @@ -40,12 +40,13 @@ EE quantizeI32ToI8(TensorDesc dDesc, const I32 *data, TensorDesc *qDesc, INT8 *q EE convolution_int8(TensorDesc inputDesc, UINT8 *input, + F32 *eltwiseInput, TensorDesc filterDesc, const INT8 *filter, ConvolutionParamSpec convParamSpec, ConvolutionForwardAlgorithm algorithm, TensorDesc biasDesc, - const I32 *bias, + const F32 *bias, U32 tmpBytes, void *tmp, TensorDesc outputDesc, @@ -56,11 +57,12 @@ EE convolution_int8(TensorDesc inputDesc, EE convolution_direct(TensorDesc inputDesc, UINT8 *inArray, + F32 *eltwiseInput, TensorDesc filterDesc, const INT8 *filterArray, ConvolutionParamSpec convParamSpec, TensorDesc biasDesc, - const I32 *biasArray, + const F32 *biasArray, U32 tmpBytes, void *tmp, TensorDesc outputDesc, @@ -84,11 +86,12 @@ EE convolution_infer_forward_tmp_bytes_int8(TensorDesc inputDesc, EE convolution_1x1_direct(TensorDesc inputDesc, UINT8 *inArray, + F32 *eltwiseInput, TensorDesc filterDesc, const INT8 *filterArray, ConvolutionParamSpec convParamSpec, TensorDesc biasDesc, - const I32 *biasArray, + const F32 *biasArray, U32 tmpBytes, void *tmp, TensorDesc outputDesc, @@ -96,4 +99,79 @@ EE convolution_1x1_direct(TensorDesc inputDesc, F32 *scale, ActivationParamSpec activationDesc); -#endif //CHEETAH_TENSOR_COMPUTING_INT8_H \ No newline at end of file +EE pooling_c16_uint8(TensorDesc inputDesc, + const UINT8 *input, + PoolingParamSpec poolingParamSpec, + TensorDesc outputDesc, + UINT8 *output, + void *scale); + +EE rnncell_int8(TensorDesc xDesc, + const void *currentX, + const TensorDesc *filterDesc, + const void **filter, + const TensorDesc *biasDesc, + const void **bias, + F32 *scale, + void *state, + U32 tmpBytes, + void *tmp, + RNNParamSpec rnnParamSpec, + U32 batchStrideX, + U32 batchStrideH, + TensorDesc hDesc, + void *output, + Arch arch); + +EE lstmcell_int8(TensorDesc xDesc, + const void *currentX, + const TensorDesc *filterDesc, + const void **filter, + const TensorDesc *biasDesc, + const void **bias, + F32 *scale, + void *state, + U32 tmpBytes, + void *tmp, + RNNParamSpec rnnParamSpec, + U32 batchStrideX, + U32 batchStrideH, + TensorDesc hDesc, + void *output, + Arch arch); + +EE depthwise_pointwise_convolution_int8(TensorDesc inputDesc, + UINT8 *inArray, + F32 *eltwiseInput, + TensorDesc dwFilterDesc, + const INT8 *dwFilterArray, + 
TensorDesc pwFilterDesc, + const INT8 *pwFilterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc dwBiasDesc, + const F32 *dwBiasArray, + TensorDesc pwBiasDesc, + const F32 *pwBiasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *outArray, + F32 *scale, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec); + +EE depthwise_convolution_transform_filter_int8(TensorDesc filterDesc, + const INT8 *filter, + TensorDesc *ftmDesc, + INT8 *filterTransformed); + +EE depthwise_pointwise_convolution_transform_filter_int8(TensorDesc dwFilterDesc, + const INT8 *dwFilter, + TensorDesc pwFilterDesc, + const INT8 *pwFilter, + DepthwiseConvolutionForwardAlgorithm algorithm, + TensorDesc *dwFtmDesc, + INT8 *dwFilterTransformed, + TensorDesc *pwFtmDesc, + INT8 *pwFilterTransformed); +#endif //CHEETAH_TENSOR_COMPUTING_INT8_H diff --git a/compute/tensor/src/cpu/x86/int8/transform_functions_int8.h b/compute/tensor/src/cpu/x86/int8/transform_functions_int8.h index 6196d16d..29ef416f 100644 --- a/compute/tensor/src/cpu/x86/int8/transform_functions_int8.h +++ b/compute/tensor/src/cpu/x86/int8/transform_functions_int8.h @@ -24,10 +24,10 @@ inline void PaddingNCHWC16( DataFormat idf; U32 in, ic, ih, iw; CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; U32 padih = paddingT + paddingB + ih; U32 padiw = paddingL + paddingR + iw; @@ -38,8 +38,8 @@ inline void PaddingNCHWC16( U32 icNum = ic / 16; for (U32 c = 0; c < icNum; ++c) { U32 coff = c * padih * padiw * simdW; - memset(tmp + coff, 128, padiw * paddingT * simdW); - memset(tmp + coff + (ih + paddingT) * padiw * simdW, 128, padiw * paddingB * simdW); + UNI_MEMSET(tmp + coff, 128, padiw * paddingT * simdW); + UNI_MEMSET(tmp + coff + (ih + paddingT) * padiw * simdW, 128, padiw * paddingB * simdW); } for (U32 hc = 0; hc < ih * icNum; ++hc) { U32 c = hc / ih; @@ -47,10 +47,10 @@ inline void PaddingNCHWC16( U32 h = hc % ih; U32 hoff = (h + paddingT) * padiw; - memset(tmp + coff + hoff * simdW, 128, paddingL * simdW); - memcpy(tmp + coff + (hoff + paddingL) * simdW, data + c * ih * iw * simdW + h * iw * simdW, - iw * simdW); - memset(tmp + coff + (hoff + (paddingL + iw)) * simdW, 128, paddingR * simdW); + UNI_MEMSET(tmp + coff + hoff * simdW, 128, paddingL * simdW); + UNI_MEMCPY(tmp + coff + (hoff + paddingL) * simdW, + data + c * ih * iw * simdW + h * iw * simdW, iw * simdW); + UNI_MEMSET(tmp + coff + (hoff + (paddingL + iw)) * simdW, 128, paddingR * simdW); } icNum *= 16; @@ -58,14 +58,14 @@ inline void PaddingNCHWC16( while (resC > 0) { U32 cx = (resC == 12) ? 
8 : resC; // resC: 4, 8, 12, 16 U32 coff = icNum * padih * padiw; - memset(tmp + coff, 128, padiw * paddingT * cx); - memset(tmp + coff + (ih + paddingT) * padiw * cx, 128, padiw * paddingB * cx); + UNI_MEMSET(tmp + coff, 128, padiw * paddingT * cx); + UNI_MEMSET(tmp + coff + (ih + paddingT) * padiw * cx, 128, padiw * paddingB * cx); for (U32 h = 0; h < ih; ++h) { U32 hoff = (h + paddingT) * padiw; - memset(tmp + coff + hoff * cx, 128, paddingL * cx); - memcpy( + UNI_MEMSET(tmp + coff + hoff * cx, 128, paddingL * cx); + UNI_MEMCPY( tmp + coff + (hoff + paddingL) * cx, data + icNum * ih * iw + h * iw * cx, iw * cx); - memset(tmp + coff + (hoff + (paddingL + iw)) * cx, 128, paddingR * cx); + UNI_MEMSET(tmp + coff + (hoff + (paddingL + iw)) * cx, 128, paddingR * cx); } resC -= cx; } @@ -79,10 +79,10 @@ inline void PaddingNCHW2NCHWC16( DataFormat idf; U32 in, ic, ih, iw; CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; U32 padih = paddingT + paddingB + ih; U32 padiw = paddingL + paddingR + iw; @@ -92,8 +92,8 @@ inline void PaddingNCHW2NCHWC16( U32 icNum = ic / 16; for (U32 c = 0; c < icNum; ++c) { U32 coff = c * padih * padiw * simdW; - memset(tmp + coff, 128, padiw * paddingT * simdW); - memset(tmp + coff + (ih + paddingT) * padiw * simdW, 128, padiw * paddingB * simdW); + UNI_MEMSET(tmp + coff, 128, padiw * paddingT * simdW); + UNI_MEMSET(tmp + coff + (ih + paddingT) * padiw * simdW, 128, padiw * paddingB * simdW); } for (U32 hc = 0; hc < ih * icNum; ++hc) { U32 c = hc / ih; @@ -101,7 +101,7 @@ inline void PaddingNCHW2NCHWC16( U32 h = hc % ih; U32 hoff = (h + paddingT) * padiw; - memset(tmp + coff + hoff * simdW, 128, paddingL * simdW); + UNI_MEMSET(tmp + coff + hoff * simdW, 128, paddingL * simdW); for (U32 w = 0; w < iw; ++w) { for (U32 s = 0; s < simdW; ++s) { U32 iIdx = (c * simdW + s) * ih * iw + h * iw + w; @@ -109,7 +109,7 @@ inline void PaddingNCHW2NCHWC16( tmp[oIdx] = data[iIdx]; } } - memset(tmp + coff + (hoff + (paddingL + iw)) * simdW, 128, paddingR * simdW); + UNI_MEMSET(tmp + coff + (hoff + (paddingL + iw)) * simdW, 128, paddingR * simdW); } icNum *= 16; @@ -118,11 +118,11 @@ inline void PaddingNCHW2NCHWC16( U32 icx = ic - icNum; U32 cx = (resC == 12) ? 
8 : resC; // resC: 4, 8, 12, 16 U32 coff = icNum * padih * padiw; - memset(tmp + coff, 128, padiw * paddingT * cx); - memset(tmp + coff + (ih + paddingT) * padiw * cx, 128, padiw * paddingB * cx); + UNI_MEMSET(tmp + coff, 128, padiw * paddingT * cx); + UNI_MEMSET(tmp + coff + (ih + paddingT) * padiw * cx, 128, padiw * paddingB * cx); for (U32 h = 0; h < ih; ++h) { U32 hoff = (h + paddingT) * padiw; - memset(tmp + coff + hoff * cx, 128, paddingL * cx); + UNI_MEMSET(tmp + coff + hoff * cx, 128, paddingL * cx); for (U32 w = 0; w < iw; ++w) { U32 woff = (hoff + paddingL) * cx + w * cx; for (U32 s = 0; s < icx; ++s) { @@ -130,9 +130,9 @@ inline void PaddingNCHW2NCHWC16( U32 oIdx = coff + woff + s; tmp[oIdx] = data[iIdx]; } - memset(tmp + coff + woff + icx, 128, cx - icx); + UNI_MEMSET(tmp + coff + woff + icx, 128, cx - icx); } - memset(tmp + coff + (hoff + (paddingL + iw)) * cx, 128, paddingR * cx); + UNI_MEMSET(tmp + coff + (hoff + (paddingL + iw)) * cx, 128, paddingR * cx); } resC -= cx; } @@ -146,10 +146,10 @@ inline void PaddingNCHWC8ToNCHWC16( DataFormat idf; U32 in, ic, ih, iw; CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; U32 padih = paddingT + paddingB + ih; U32 padiw = paddingL + paddingR + iw; @@ -161,8 +161,8 @@ inline void PaddingNCHWC8ToNCHWC16( if (paddingT != 0 || paddingB != 0) { for (U32 c = 0; c < icNum; ++c) { U32 coff = c * padih * padiw * simdW; - memset(tmp + coff, 128, padiw * paddingT * simdW); - memset(tmp + coff + (ih + paddingT) * padiw * simdW, 128, padiw * paddingB * simdW); + UNI_MEMSET(tmp + coff, 128, padiw * paddingT * simdW); + UNI_MEMSET(tmp + coff + (ih + paddingT) * padiw * simdW, 128, padiw * paddingB * simdW); } } for (U32 hc = 0; hc < ih * icNum; ++hc) { @@ -171,32 +171,32 @@ inline void PaddingNCHWC8ToNCHWC16( U32 h = hc % ih; U32 hoff = (h + paddingT) * padiw; - memset(tmp + coff + hoff * simdW, 128, paddingL * simdW); + UNI_MEMSET(tmp + coff + hoff * simdW, 128, paddingL * simdW); for (U32 w = 0; w < iw; ++w) { for (U32 s = 0; s < simdW; s += 8) { U32 iIdx = (c * simdW + s) * ih * iw + (h * iw + w) * 8; U32 oIdx = coff + (hoff + paddingL) * simdW + w * simdW + s; - memcpy(tmp + oIdx, data + iIdx, 8); + UNI_MEMCPY(tmp + oIdx, data + iIdx, 8); } } - memset(tmp + coff + (hoff + (paddingL + iw)) * simdW, 128, paddingR * simdW); + UNI_MEMSET(tmp + coff + (hoff + (paddingL + iw)) * simdW, 128, paddingR * simdW); } icNum *= 16; if (ic > icNum) { U32 cx = 8; U32 coff = icNum * padih * padiw; - memset(tmp + coff, 128, padiw * paddingT * cx); - memset(tmp + coff + (ih + paddingT) * padiw * cx, 128, padiw * paddingB * cx); + UNI_MEMSET(tmp + coff, 128, padiw * paddingT * cx); + UNI_MEMSET(tmp + coff + (ih + paddingT) * padiw * cx, 128, padiw * paddingB * cx); for (U32 h = 0; h < ih; ++h) { U32 hoff = (h + paddingT) * padiw; - memset(tmp + coff + hoff * cx, 128, paddingL * cx); + UNI_MEMSET(tmp + coff + hoff * cx, 128, paddingL * cx); for (U32 w = 0; w < iw; ++w) { U32 iIdx = icNum * ih * iw + (h * iw + w) * 8; U32 oIdx = coff + (hoff + paddingL) * cx + w * cx; - memcpy(tmp + oIdx, data + iIdx, 8); + UNI_MEMCPY(tmp + oIdx, data + iIdx, 8); } - memset(tmp + coff + (hoff + (paddingL + iw)) * cx, 128, 
paddingR * cx); + UNI_MEMSET(tmp + coff + (hoff + (paddingL + iw)) * cx, 128, paddingR * cx); } } } diff --git a/compute/tensor/src/cpu/x86/int8/x86_functions_int8.h b/compute/tensor/src/cpu/x86/int8/x86_functions_int8.h index 79bb1c1b..1b50aeb5 100644 --- a/compute/tensor/src/cpu/x86/int8/x86_functions_int8.h +++ b/compute/tensor/src/cpu/x86/int8/x86_functions_int8.h @@ -24,9 +24,7 @@ inline EE activation_offset_int8( { U32 num32 = len / 32; U32 resMask = pow(2, len % 32) - 1; - EE ret = SUCCESS; - switch (activationDesc.mode) { case ACTIVATION_NULL: { break; diff --git a/compute/tensor/src/cpu/x86/normalization.cpp b/compute/tensor/src/cpu/x86/normalization.cpp index aaf9f160..3cb55145 100644 --- a/compute/tensor/src/cpu/x86/normalization.cpp +++ b/compute/tensor/src/cpu/x86/normalization.cpp @@ -16,8 +16,13 @@ #include "cpu/x86/fp32/tensor_computing_fp32.h" #endif -EE layer_normalization_x86( - TensorDesc inputDesc, void *input, void *alpha, void *beta, TensorDesc outputDesc, void *output) +EE layer_normalization_x86(TensorDesc inputDesc, + void *input, + LayerNormParamSpec p, + void *alpha, + void *beta, + TensorDesc outputDesc, + void *output) { DataType idt = inputDesc.dt; EE ret = SUCCESS; @@ -25,7 +30,7 @@ EE layer_normalization_x86( #ifdef _USE_FP32 case DT_F32: { ret = layer_normalization_fp32( - inputDesc, (F32 *)input, (F32 *)alpha, (F32 *)beta, outputDesc, (F32 *)output); + inputDesc, (F32 *)input, p, (F32 *)alpha, (F32 *)beta, outputDesc, (F32 *)output); break; } #endif diff --git a/compute/tensor/src/cpu/x86/pooling.cpp b/compute/tensor/src/cpu/x86/pooling.cpp index 9b7c8d95..b7ca2bdf 100644 --- a/compute/tensor/src/cpu/x86/pooling.cpp +++ b/compute/tensor/src/cpu/x86/pooling.cpp @@ -15,11 +15,14 @@ #ifdef _USE_FP32 #include "cpu/x86/fp32/tensor_computing_fp32.h" #endif +#ifdef _USE_INT8 +#include "cpu/x86/int8/tensor_computing_int8.h" +#endif EE pooling_x86(TensorDesc inputDesc, const void *input, PoolingParamSpec poolingParamSpec, - const void *scale, + void *scale, TensorDesc outputDesc, void *output) { @@ -34,6 +37,20 @@ EE pooling_x86(TensorDesc inputDesc, } else if (inputDesc.df == DF_NCHWC16) { ret = pooling_c16_fp32( inputDesc, (const F32 *)input, poolingParamSpec, outputDesc, (F32 *)output); + } else if (inputDesc.df == DF_NCHW) { + ret = pooling_nchw_fp32( + inputDesc, (const F32 *)input, poolingParamSpec, outputDesc, (F32 *)output); + } else { + ret = NOT_SUPPORTED; + } + break; + } +#endif +#ifdef _USE_INT8 + case DT_U8_Q: { + if (inputDesc.df == DF_NCHWC16) { + ret = pooling_c16_uint8(inputDesc, (const UINT8 *)input, poolingParamSpec, + outputDesc, (UINT8 *)output, scale); } else { ret = NOT_SUPPORTED; } @@ -67,4 +84,4 @@ EE pooling_bp_x86(TensorDesc inputDesc, break; } return ret; -} \ No newline at end of file +} diff --git a/compute/tensor/src/cpu/x86/quantize.cpp b/compute/tensor/src/cpu/x86/quantize.cpp index f2b5e732..4dbe0d8d 100644 --- a/compute/tensor/src/cpu/x86/quantize.cpp +++ b/compute/tensor/src/cpu/x86/quantize.cpp @@ -87,8 +87,26 @@ EE quantize_bias_offsetC(const void *bias, switch (qType) { #ifdef _USE_INT8 case DT_I32: { - ret = quantizeBiasOffsetCI32( - (F32 *)bias, biasDesc, (INT8 *)filter, filterDesc, scale, (I32 *)offsetCBias); + ret = quantizeBiasOffsetCI32((const F32 *)bias, biasDesc, (INT8 *)filter, + filterDesc, scale, (I32 *)offsetCBias); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + } else if (biasDesc.dt == DT_I32) { + switch (qType) { +#ifdef _USE_INT8 + case DT_I32: { + if (filter == nullptr) { + 
UNI_MEMCPY(offsetCBias, bias, tensorNumBytes(biasDesc)); + } else { + for (U32 i = 0; i < tensorNumElements(biasDesc); ++i) { + ((I32 *)offsetCBias)[i] = ((I32 *)bias)[i] + ((I32 *)filter)[i]; + } + } break; } #endif diff --git a/compute/tensor/src/cpu/x86/rnn.cpp b/compute/tensor/src/cpu/x86/rnn.cpp index 7e9ce1d2..e4e247f0 100644 --- a/compute/tensor/src/cpu/x86/rnn.cpp +++ b/compute/tensor/src/cpu/x86/rnn.cpp @@ -15,7 +15,9 @@ #ifdef _USE_FP32 #include "cpu/x86/fp32/tensor_computing_fp32.h" #endif -#include "blas_enhance.h" +#ifdef _USE_INT8 +#include "cpu/x86/int8/tensor_computing_int8.h" +#endif EE rnncell_x86(TensorDesc xDesc, const void *currentX, @@ -23,6 +25,7 @@ EE rnncell_x86(TensorDesc xDesc, const void **filter, const TensorDesc *biasDesc, const void **bias, + float *scale, void *state, U32 tmpBytes, void *tmp, @@ -33,17 +36,25 @@ EE rnncell_x86(TensorDesc xDesc, void *output, Arch arch) { - EE ret = SUCCESS; + EE ret = NOT_SUPPORTED; switch (xDesc.dt) { #ifdef _USE_FP32 case DT_F32: { - ret = rnncell_fp32(xDesc, currentX, filterDesc, filter, biasDesc, bias, state, tmpBytes, - tmp, rnnParamSpec, batchStrideX, batchStrideH, hDesc, output, arch); + if (0) { +#if defined(_USE_INT8) && defined(_USE_ULTRA_OPTIMIZATION) + } else if (arch == X86_AVX512 && rnnParamSpec.mode == RNN_LSTM && + rnnParamSpec.num_projection == 0) { + ret = rnncell_int8(xDesc, currentX, filterDesc, filter, biasDesc, bias, scale, state, + tmpBytes, tmp, rnnParamSpec, batchStrideX, batchStrideH, hDesc, output, arch); +#endif + } else { + ret = rnncell_fp32(xDesc, currentX, filterDesc, filter, biasDesc, bias, state, + tmpBytes, tmp, rnnParamSpec, batchStrideX, batchStrideH, hDesc, output, arch); + } break; } #endif default: - ret = NOT_SUPPORTED; break; } return ret; diff --git a/compute/tensor/src/cpu/x86/scale.cpp b/compute/tensor/src/cpu/x86/scale.cpp index 6c7ded30..00d8c9a3 100644 --- a/compute/tensor/src/cpu/x86/scale.cpp +++ b/compute/tensor/src/cpu/x86/scale.cpp @@ -12,6 +12,7 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
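For orientation, the rnncell_x86 change above only takes the INT8 kernel for plain LSTM cells without projection on an AVX-512 build, and falls back to the FP32 cell otherwise. A condensed sketch of that dispatch decision, using hypothetical, simplified types rather than the library's signatures:

    // Hypothetical condensation of the rnncell_x86 selection logic above:
    // the INT8 LSTM kernel is only taken for LSTM cells with no projection
    // on AVX-512; every other configuration uses the FP32 implementation.
    enum class RnnMode { LSTM, GRU };

    struct RnnConfig {
        RnnMode mode;
        int num_projection;
    };

    static const char *pick_rnn_kernel(bool isAvx512, bool int8Enabled, const RnnConfig &cfg)
    {
        if (int8Enabled && isAvx512 && cfg.mode == RnnMode::LSTM && cfg.num_projection == 0) {
            return "rnncell_int8";   // quantized fast path
        }
        return "rnncell_fp32";       // generic fall-back
    }
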
#include "cpu/x86/tensor_computing_x86.h" +#include "cpu/x86/int32/tensor_computing_int32.h" #ifdef _USE_FP32 #include "cpu/x86/fp32/tensor_computing_fp32.h" #endif @@ -37,7 +38,7 @@ EE scale_x86(TensorDesc inputDesc, CHECK_REQUIREMENT(oc % 16 == 0); axis = outputDesc.nDims + 1; } - EE ret = SUCCESS; + EE ret = NOT_SUPPORTED; switch (outputDesc.dt) { #ifdef _USE_FP32 case DT_F32: { @@ -46,10 +47,13 @@ EE scale_x86(TensorDesc inputDesc, break; } #endif + case DT_I32: { + ret = scale_int32((I32 *)input, axis, outputDesc.nDims, (I32 *)alpha, (I32 *)beta, on, + oc, elements_per_channel, ic, (I32 *)output); + break; + } default: - ret = NOT_SUPPORTED; break; } - return ret; } diff --git a/compute/tensor/src/cpu/x86/softmax.cpp b/compute/tensor/src/cpu/x86/softmax.cpp index 9c2a37f0..da00fcda 100644 --- a/compute/tensor/src/cpu/x86/softmax.cpp +++ b/compute/tensor/src/cpu/x86/softmax.cpp @@ -19,9 +19,8 @@ EE softmax_x86( TensorDesc inputDesc, const void *input, SoftmaxParamSpec p, TensorDesc outputDesc, void *output) { - DataType idt = inputDesc.dt; - EE ret = SUCCESS; - switch (idt) { + EE ret = NOT_SUPPORTED; + switch (inputDesc.dt) { #ifdef _USE_FP32 case DT_F32: { ret = softmax_fp32(inputDesc, (const F32 *)input, p.axis, outputDesc, (F32 *)output); @@ -29,9 +28,24 @@ EE softmax_x86( } #endif default: - ret = NOT_SUPPORTED; break; } + return ret; +} +EE logsoftmax_x86( + TensorDesc inputDesc, const void *input, SoftmaxParamSpec p, TensorDesc outputDesc, void *output) +{ + EE ret = NOT_SUPPORTED; + switch (inputDesc.dt) { +#ifdef _USE_FP32 + case DT_F32: { + ret = logsoftmax_fp32(inputDesc, (const F32 *)input, p.axis, outputDesc, (F32 *)output); + break; + } +#endif + default: + break; + } return ret; } diff --git a/compute/tensor/src/cpu/x86/tensor_computing_x86.h b/compute/tensor/src/cpu/x86/tensor_computing_x86.h index 05f4cef0..16f3fcf0 100644 --- a/compute/tensor/src/cpu/x86/tensor_computing_x86.h +++ b/compute/tensor/src/cpu/x86/tensor_computing_x86.h @@ -102,6 +102,7 @@ EE depthwise_pointwise_convolution_x86(TensorDesc inputDesc, const void *pwFilter, ConvolutionParamSpec convParamSpec, DepthwiseConvolutionForwardAlgorithm algorithm, + void *scale, TensorDesc dwBiasDesc, const void *dwBias, TensorDesc pwBiasDesc, @@ -124,6 +125,7 @@ EE depthwise_convolution_transform_filter_x86(TensorDesc filterDesc, void *filterTransformed); EE depthwise_convolution_infer_forward_tmp_bytes_x86(TensorDesc inputDesc, + TensorDesc dwFilterDesc, TensorDesc outputDesc, ConvolutionParamSpec convParamSpec, DepthwiseConvolutionForwardAlgorithm algorithm, @@ -135,6 +137,7 @@ EE depthwise_convolution_x86(TensorDesc inputDesc, const void *filter, ConvolutionParamSpec convParamSpec, DepthwiseConvolutionForwardAlgorithm algorithm, + void *scale, TensorDesc biasDesc, const void *bias, U32 tmpBytes, @@ -152,8 +155,13 @@ EE eltwise_x86(DataType dataType, void *output, EltwiseMode eltwiseMode); -EE layer_normalization_x86( - TensorDesc inputDesc, void *input, void *alpha, void *beta, TensorDesc outputDesc, void *output); +EE layer_normalization_x86(TensorDesc inputDesc, + void *input, + LayerNormParamSpec p, + void *alpha, + void *beta, + TensorDesc outputDesc, + void *output); EE rnncell_x86(TensorDesc xDesc, const void *currentX, @@ -161,6 +169,7 @@ EE rnncell_x86(TensorDesc xDesc, const void **filter, const TensorDesc *biasDesc, const void **bias, + float *scale, void *state, U32 tmpBytes, void *tmp, @@ -182,7 +191,7 @@ EE scale_x86(TensorDesc inputDesc, EE pooling_x86(TensorDesc inputDesc, const void *input, 
PoolingParamSpec poolingParamSpec, - const void *scale, + void *scale, TensorDesc outputDesc, void *output); @@ -197,6 +206,9 @@ EE reshape_x86(TensorDesc inputDesc, void *input, TensorDesc outputDesc, void *o EE softmax_x86( TensorDesc inputDesc, const void *input, SoftmaxParamSpec p, TensorDesc outputDesc, void *output); +EE logsoftmax_x86( + TensorDesc inputDesc, const void *input, SoftmaxParamSpec p, TensorDesc outputDesc, void *output); + EE deconvolution_transform_filter_x86(TensorDesc filterDesc, const void *filter, ConvolutionForwardAlgorithm algorithm, diff --git a/compute/tensor/src/cpu/x86/x86_functions.h b/compute/tensor/src/cpu/x86/x86_functions.h index 7c93143f..e9353619 100644 --- a/compute/tensor/src/cpu/x86/x86_functions.h +++ b/compute/tensor/src/cpu/x86/x86_functions.h @@ -91,6 +91,9 @@ inline void array_power_x86(DataType dt, void *input, void *output, I32 len, F32 array_power_f32((F32 *)input, (F32 *)output, len, power); break; #endif + case DT_I64: + array_power_template((I64 *)input, (I64 *)output, len, power); + break; case DT_I32: array_power_template((I32 *)input, (I32 *)output, len, power); break; @@ -112,6 +115,10 @@ inline F32 array_sum_x86(DataType dt, const void *data, I32 len) result = array_sum_f32((const F32 *)data, len); break; #endif + case DT_U32: + case DT_I32: + result = array_sum_i32((const I32 *)data, len); + break; default: CHECK_STATUS(NOT_SUPPORTED); break; @@ -128,6 +135,9 @@ inline void array_scale_x86( array_scale_f32((const F32 *)input, (F32 *)output, len, alpha, beta); break; #endif + case DT_I64: + array_scale_template((const I64 *)input, (I64 *)output, len, alpha, beta); + break; case DT_I32: array_scale_template((const I32 *)input, (I32 *)output, len, alpha, beta); break; @@ -188,6 +198,9 @@ inline EE array_minmax_value_x86(DataType dt, const void *data, I32 len, int mod ret = array_minmax_value_f32((const F32 *)data, len, mode, result); break; #endif + case DT_U32: + ret = array_minmax_value_general(dt, data, len, mode, result); + break; case DT_I32: ret = array_minmax_value_i32((const I32 *)data, len, mode, result); break; diff --git a/compute/tensor/src/cpu/yolov3detectionoutput.cpp b/compute/tensor/src/cpu/yolov3detectionoutput.cpp index 966af6a3..1afe1f17 100644 --- a/compute/tensor/src/cpu/yolov3detectionoutput.cpp +++ b/compute/tensor/src/cpu/yolov3detectionoutput.cpp @@ -12,86 +12,9 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
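The uint8 pooling path added above (pooling_c16_uint8, plus the DT_U8_Q branch in pooling_x86) averages in fixed point: the accumulator starts at -128 times the window size so the uint8 offset is removed, the signed sum is multiplied by the precomputed poolSize factor (65536 divided by the effective window size) and shifted right by 16, and 128 is added back before saturating to uint8. A scalar approximation of that arithmetic for one output element, assuming count_include_pad is false so the factor is rebuilt from the effective window kh*kw:

    #include <cstdint>

    // Scalar model of the uint8 mean-pooling requantization sketched above.
    static uint8_t mean_pool_u8(const uint8_t *window, int kh, int kw)
    {
        const int32_t factor = 65536 / (kh * kw);   // 16.16 fixed-point 1/n
        int32_t acc = -128 * kh * kw;               // pre-subtract the u8 offset
        for (int i = 0; i < kh * kw; ++i) {
            acc += window[i];                       // acc = sum(x_i - 128)
        }
        int32_t avg = (acc * factor) >> 16;         // fixed-point average
        int32_t v = avg + 128;                      // restore the u8 offset
        if (v < 0) {                                // saturate to [0, 255]
            v = 0;
        }
        if (v > 255) {
            v = 255;
        }
        return (uint8_t)v;
    }
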
#include "cpu/tensor_computing_cpu.h" +#include "cpu/non_max_suppression.h" #include "tensor_transpose.h" -inline EE qsort_descent(std::vector &boxes, std::vector &scores, int left, int right) -{ - if (boxes.empty() || scores.empty()) { - return NOT_SUPPORTED; - } - - int i = left; - int j = right; - F32 temp = scores[(left + right) / 2]; - - while (i <= j) { - while (scores[i] > temp) { - i++; - } - while (scores[j] < temp) { - j--; - } - if (i <= j) { - std::swap(boxes[i], boxes[j]); - std::swap(scores[i], scores[j]); - i++; - j--; - } - } - - if (left < j) { - qsort_descent(boxes, scores, left, j); - } - if (i < right) { - qsort_descent(boxes, scores, i, right); - } - - return SUCCESS; -} - -inline F32 intersectionarea(BoxRect a, BoxRect b) -{ - if (a.xmin > b.xmax || a.xmax < b.xmin || a.ymin > b.ymax || a.ymax < b.ymin) { - return 0.f; - } - F32 inter_width = std::min(a.xmax, b.xmax) - std::max(a.xmin, b.xmin); - F32 inter_height = std::min(a.ymax, b.ymax) - std::max(a.ymin, b.ymin); - - return inter_width * inter_height; -} - -inline EE nms_pickedboxes(std::vector boxes, std::vector &picked, F32 nms_threshold) -{ - I64 n = boxes.size(); - - std::vector areas(n); - for (I64 i = 0; i < n; i++) { - BoxRect box = boxes[i]; - - F32 width = box.xmax - box.xmin; - F32 height = box.ymax - box.ymin; - - areas[i] = width * height; - } - for (I64 i = 0; i < n; i++) { - BoxRect a = boxes[i]; - int keep = 1; - for (int j = 0; j < (int)picked.size(); j++) { - BoxRect b = boxes[picked[j]]; - F32 inter_area = intersectionarea(a, b); - F32 union_area = areas[i] + areas[picked[j]] - inter_area; - - if (inter_area / union_area > nms_threshold) { - keep = 0; - } - } - if (keep) { - picked.push_back(i); - } - } - return SUCCESS; -} - template EE yolov3detectionoutput(std::vector input, T *output, @@ -123,7 +46,6 @@ EE yolov3detectionoutput(std::vector input, } std::vector all_boxrects; - std::vector all_boxscores; I64 input_size = inputDesc.size(); U32 info_per_box = 4 + 1 + num_class; ActivationParamSpec activationdesc_sigmoid; @@ -134,14 +56,11 @@ EE yolov3detectionoutput(std::vector input, CHECK_REQUIREMENT(inputDesc[i].df == DF_NCHWC8 || inputDesc[i].df == DF_NCHW); if (inputDesc[i].df == DF_NCHWC8) { T *tmp = (T *)malloc(tensorNumBytes(inputDesc[0])); - memcpy(tmp, in, tensorNumBytes(inputDesc[0])); + UNI_MEMCPY(tmp, in, tensorNumBytes(inputDesc[0])); CHECK_STATUS(transformToNCHW(inputDesc[0], tmp, inputDesc[0], in)); free(tmp); } - std::vector> allbox_boxrects; - std::vector> allbox_boxscores; - allbox_boxrects.resize(num_box); - allbox_boxscores.resize(num_box); + std::vector> allbox_boxrects(num_box); U32 w = inputDesc[i].dims[0]; U32 h = inputDesc[i].dims[1]; @@ -190,9 +109,9 @@ EE yolov3detectionoutput(std::vector input, F32 box_ymin = box_cy - box_h * 0.5; F32 box_xmax = box_cx + box_w * 0.5; F32 box_ymax = box_cy + box_h * 0.5; - BoxRect box = {box_xmin, box_ymin, box_xmax, box_ymax, label}; + BoxRect box = { + box_xmin, box_ymin, box_xmax, box_ymax, label, score_conf, INT_MAX}; allbox_boxrects[b].push_back(box); - allbox_boxscores[b].push_back(score_conf); } idx++; } @@ -202,34 +121,28 @@ EE yolov3detectionoutput(std::vector input, for (U32 b = 0; b < num_box; b++) { all_boxrects.insert( all_boxrects.end(), allbox_boxrects[b].begin(), allbox_boxrects[b].end()); - all_boxscores.insert( - all_boxscores.end(), allbox_boxscores[b].begin(), allbox_boxscores[b].end()); } } // sort boxes - qsort_descent(all_boxrects, all_boxscores, 0, static_cast(all_boxscores.size() - 1)); + 
std::stable_sort(all_boxrects.begin(), all_boxrects.end(), + [&](const BoxRect &a, const BoxRect &b) { return (a.score > b.score); }); // apply nms - std::vector picked; - nms_pickedboxes(all_boxrects, picked, nms_threshold); + std::vector picked = nms_pickedboxes(all_boxrects, nms_threshold); std::vector boxrects; - std::vector boxscores; - for (I64 p = 0; p < (I64)picked.size(); p++) { + for (U32 p = 0; p < picked.size(); p++) { I64 picked_box = picked[p]; boxrects.push_back(all_boxrects[picked_box]); - boxscores.push_back(all_boxscores[picked_box]); } - U32 num_detected = static_cast(boxrects.size()); + U32 num_detected = boxrects.size(); // the first box contains the number of availble boxes output[0] = num_detected; output[1] = output[2] = output[3] = output[4] = output[5] = 0; for (U32 i = 0; i < num_detected; i++) { BoxRect b = boxrects[i]; - F32 score = boxscores[i]; - output[(i + 1) * 6] = b.label + 1; - output[(i + 1) * 6 + 1] = score; + output[(i + 1) * 6 + 1] = b.score; output[(i + 1) * 6 + 2] = b.xmin; output[(i + 1) * 6 + 3] = b.ymin; output[(i + 1) * 6 + 4] = b.xmax; diff --git a/compute/tensor/src/cumsum.cpp b/compute/tensor/src/cumsum.cpp new file mode 100644 index 00000000..1cda9689 --- /dev/null +++ b/compute/tensor/src/cumsum.cpp @@ -0,0 +1,48 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
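The yolov3detectionoutput.cpp rework above drops the local qsort_descent / intersectionarea / nms_pickedboxes helpers in favor of std::stable_sort plus the shared routine from cpu/non_max_suppression.h. The greedy IoU-based NMS it relies on can be sketched as follows (stand-alone hypothetical types, not the library's BoxRect):

    #include <algorithm>
    #include <vector>

    // Hypothetical stand-alone box type; the library's BoxRect also carries
    // a label field, which is omitted here.
    struct Box {
        float xmin, ymin, xmax, ymax, score;
    };

    static float iou(const Box &a, const Box &b)
    {
        float iw = std::min(a.xmax, b.xmax) - std::max(a.xmin, b.xmin);
        float ih = std::min(a.ymax, b.ymax) - std::max(a.ymin, b.ymin);
        if (iw <= 0 || ih <= 0) {
            return 0.f;
        }
        float inter = iw * ih;
        float areaA = (a.xmax - a.xmin) * (a.ymax - a.ymin);
        float areaB = (b.xmax - b.xmin) * (b.ymax - b.ymin);
        return inter / (areaA + areaB - inter);
    }

    // Greedy NMS: visit boxes in descending score order and keep a box only
    // if its IoU with every already-kept box stays at or below the threshold.
    static std::vector<int> nms(std::vector<Box> boxes, float threshold)
    {
        std::stable_sort(boxes.begin(), boxes.end(),
            [](const Box &a, const Box &b) { return a.score > b.score; });
        std::vector<int> picked;
        for (int i = 0; i < (int)boxes.size(); ++i) {
            bool keep = true;
            for (int j : picked) {
                if (iou(boxes[i], boxes[j]) > threshold) {
                    keep = false;
                    break;
                }
            }
            if (keep) {
                picked.push_back(i);
            }
        }
        return picked;
    }
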
+ +#include "tensor_computing.h" +#ifdef _USE_GENERAL +#include "cpu/general/tensor_computing_general.h" +#endif +#ifdef _USE_NEON +#include "cpu/arm/tensor_computing_arm.h" +#endif +#ifdef _USE_X86 +#include "cpu/x86/tensor_computing_x86.h" +#endif + +EE cumsum_infer_output_size(Tensor *inputTensor, Tensor *outputTensor, ArchInfo_t archInfo) +{ + if (inputTensor == nullptr || outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + TensorDesc outputDesc = inputDesc; + outputTensor->resize(outputDesc); + return SUCCESS; +} + +EE cumsum(Tensor inputTensor, CumSumParamSpec p, Tensor outputTensor, ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + EE ret = NOT_SUPPORTED; + if (IS_CPU(arch)) { + ret = cumsum_general(inputDesc, input, p, outputDesc, output); + } + return ret; +} diff --git a/compute/tensor/src/deconvolution.cpp b/compute/tensor/src/deconvolution.cpp index 9d3256b8..78261c34 100644 --- a/compute/tensor/src/deconvolution.cpp +++ b/compute/tensor/src/deconvolution.cpp @@ -26,9 +26,6 @@ inline EE deconvolution_infer_output_size_cpu(TensorDesc inputDesc, TensorDesc *outputDesc, DataType targetDataType) { - if (nullptr == outputDesc) { - CHECK_STATUS(NULL_POINTER); - } DataType idt, fdt; DataFormat idf, fdf; U32 in, ic, ih, iw; @@ -37,23 +34,22 @@ inline EE deconvolution_infer_output_size_cpu(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); CHECK_REQUIREMENT(1 == fn || ic == fn); - if (fh < 1 || fw < 1) { - CHECK_STATUS(NOT_SUPPORTED); + return NOT_SUPPORTED; } U32 strideH = convParamSpec.stride_h; U32 strideW = convParamSpec.stride_w; - if (convParamSpec.rm == TF_SAME) { + if (convParamSpec.round_mode == ROUND_TF_SAME) { oh = strideH * ih; ow = strideW * iw; } else { - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; - oh = fh + strideH * (ih - 1) - paddingT - paddingB; - ow = fw + strideW * (iw - 1) - paddingL - paddingR; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; + oh = fh + strideH * (ih - 1) - paddingT - paddingB + convParamSpec.output_pad_h; + ow = fw + strideW * (iw - 1) - paddingL - paddingR + convParamSpec.output_pad_w; } *outputDesc = tensor4df(targetDataType, DF_NCHWC8, in, fc, oh, ow); @@ -67,32 +63,29 @@ EE deconvolution_infer_output_size(Tensor *inputTensor, DataType targetDataType, ArchInfo_t archInfo) { - if (inputTensor == nullptr) { - CHECK_STATUS(NULL_POINTER); - } - if (outputTensor == nullptr) { - CHECK_STATUS(NULL_POINTER); + if (inputTensor == nullptr || outputTensor == nullptr) { + return NULL_POINTER; } TensorDesc inputDesc = inputTensor->get_desc(); TensorDesc filterDesc = filterTensor.get_desc(); TensorDesc outputDesc = outputTensor->get_desc(); - CHECK_STATUS(deconvolution_infer_output_size_cpu( - inputDesc, filterDesc, convParamSpec, &outputDesc, targetDataType)); + EE ret = deconvolution_infer_output_size_cpu( + inputDesc, filterDesc, convParamSpec, &outputDesc, targetDataType); if (IS_GPU(archInfo->arch)) { #ifdef _USE_GPU 
OclMemory *inputMem = (OclMemory *)inputTensor->get_memory(); OclMemory *outputMem = (OclMemory *)outputTensor->get_memory(); - CHECK_STATUS(deconvolution_padding_input_mali( - inputDesc, filterDesc, convParamSpec, &outputDesc, inputMem, outputMem)); + ret = deconvolution_padding_input_mali( + inputDesc, filterDesc, convParamSpec, &outputDesc, inputMem, outputMem); #endif } else { U32 fc = filterDesc.dims[filterDesc.nDims - 2]; if (fc % 8 != 0) { - CHECK_STATUS(NOT_SUPPORTED); + ret = NOT_SUPPORTED; } } outputTensor->resize(outputDesc); - return SUCCESS; + return ret; } EE deconvolution_infer_forward_algorithm(Tensor inputTensor, @@ -108,7 +101,6 @@ EE deconvolution_infer_forward_algorithm(Tensor inputTensor, TensorDesc inputDesc = inputTensor.get_desc(); TensorDesc filterDesc = filterTensor.get_desc(); TensorDesc outputDesc = outputTensor.get_desc(); - EE ret = NOT_SUPPORTED; auto arch = archInfo->arch; if (IS_GENERAL(arch)) { diff --git a/compute/tensor/src/depth2space.cpp b/compute/tensor/src/depth2space.cpp index 06511b6e..2e764ecb 100644 --- a/compute/tensor/src/depth2space.cpp +++ b/compute/tensor/src/depth2space.cpp @@ -12,6 +12,9 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #include "tensor_computing.h" +#ifdef _USE_CPU +#include "cpu/tensor_computing_cpu.h" +#endif #ifdef _USE_GPU #include "gpu/mali/tensor_computing_mali.h" #endif @@ -26,7 +29,7 @@ EE depth2space_infer_output_size( CHECK_STATUS(NULL_POINTER); } TensorDesc inputDesc = inputTensor->get_desc(); - TensorDesc outputDesc = outputTensor->get_desc(); + TensorDesc outputDesc = inputDesc; EE ret = NOT_SUPPORTED; if (IS_GPU(archInfo->arch)) { #ifdef _USE_GPU @@ -34,6 +37,13 @@ EE depth2space_infer_output_size( OclMemory *outputMem = (OclMemory *)outputTensor->get_memory(); ret = depth2space_padding_input_mali(inputDesc, p, &outputDesc, inputMem, outputMem); #endif + } else { + for (int i = 0; i < (int)outputDesc.nDims - 2; i++) { + outputDesc.dims[i] *= p.block_size; + outputDesc.dims[outputDesc.nDims - 2] /= p.block_size; + } + outputDesc.df = getTensorDefaultDataFormat(outputDesc.nDims); + ret = SUCCESS; } outputTensor->resize(outputDesc); return ret; @@ -49,6 +59,9 @@ EE depth2space_infer_forward_tmp_bytes( TensorDesc outputDesc = outputTensor.get_desc(); ret = depth2space_infer_tmpBuf_size_mali(inputDesc, p, outputDesc, bytes); #endif + } else { + *bytes = 0; + ret = SUCCESS; } return ret; } @@ -60,16 +73,20 @@ EE depth2space(Tensor inputTensor, ArchInfo_t archInfo) { auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + void *tmp = get_ptr_from_tensor(tmpTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); EE ret = NOT_SUPPORTED; if (IS_GPU(arch)) { #ifdef _USE_GPU - TensorDesc inputDesc = inputTensor.get_desc(); - void *input = get_ptr_from_tensor(inputTensor, arch); - void *tmp = get_ptr_from_tensor(tmpTensor, arch); - TensorDesc outputDesc = outputTensor.get_desc(); - void *output = get_ptr_from_tensor(outputTensor, arch); ret = depth2space_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, (GCLMem_t)input, p, (GCLMem_t)tmp, outputDesc, (GCLMem_t)output); +#endif + } else { +#ifdef _USE_CPU + ret = depth2space_cpu(inputDesc, input, p, outputDesc, output); #endif } return ret; diff --git a/compute/tensor/src/depthwise_convolution.cpp b/compute/tensor/src/depthwise_convolution.cpp index 31895ff1..aac5fdf7 100644 
--- a/compute/tensor/src/depthwise_convolution.cpp +++ b/compute/tensor/src/depthwise_convolution.cpp @@ -48,10 +48,10 @@ inline EE depthwise_convolution_infer_output_size_cpu(TensorDesc inputDesc, U32 strideH = convParamSpec.stride_h; U32 strideW = convParamSpec.stride_w; - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; U32 dilateH = convParamSpec.dilatedRate_h; U32 dilateW = convParamSpec.dilatedRate_w; @@ -64,7 +64,12 @@ inline EE depthwise_convolution_infer_output_size_cpu(TensorDesc inputDesc, CHECK_STATUS(NOT_MATCH); } - *outputDesc = tensor4df(targetDataType, DF_NCHWC8, in, ic, oh, ow); + DataFormat odf = DF_NCHWC8; + if ((idt == DT_U8_Q || idf == DF_NCHWC16) && ic % 16 == 0) { + odf = DF_NCHWC16; + } + + *outputDesc = tensor4df(targetDataType, odf, in, ic, oh, ow); return SUCCESS; } @@ -227,9 +232,7 @@ EE depthwise_convolution_infer_forward_tmp_bytes(Tensor inputTensor, ArchInfo_t archInfo) { TensorDesc inputDesc = inputTensor.get_desc(); -#if defined(_USE_NEON) || defined(_USE_GPU) TensorDesc filterDesc = filterTensor.get_desc(); -#endif TensorDesc outputDesc = outputTensor.get_desc(); EE ret = NOT_SUPPORTED; @@ -242,7 +245,7 @@ EE depthwise_convolution_infer_forward_tmp_bytes(Tensor inputTensor, #ifdef _USE_X86 } else if (IS_X86(arch)) { ret = depthwise_convolution_infer_forward_tmp_bytes_x86( - inputDesc, outputDesc, convParamSpec, algorithm, bytes); + inputDesc, filterDesc, outputDesc, convParamSpec, algorithm, bytes); #endif #ifdef _USE_NEON } else if (IS_ARM(arch)) { @@ -263,6 +266,7 @@ EE depthwise_convolution(Tensor inputTensor, Tensor filterTensor, ConvolutionParamSpec convParamSpec, DepthwiseConvolutionForwardAlgorithm algorithm, + void *scale, Tensor biasTensor, Tensor tmpTensor, Tensor outputTensor, @@ -290,7 +294,7 @@ EE depthwise_convolution(Tensor inputTensor, #ifdef _USE_X86 } else if (IS_X86(arch)) { ret = depthwise_convolution_x86(inputDesc, input, filterDesc, filter, convParamSpec, - algorithm, biasDesc, bias, tmpBytes, tmp, outputDesc, output, + algorithm, scale, biasDesc, bias, tmpBytes, tmp, outputDesc, output, depthwiseActivationParamSpec, archInfo->arch); #endif #ifdef _USE_NEON diff --git a/compute/tensor/src/depthwise_pointwise_convolution.cpp b/compute/tensor/src/depthwise_pointwise_convolution.cpp index d34bc889..22d8cfc6 100644 --- a/compute/tensor/src/depthwise_pointwise_convolution.cpp +++ b/compute/tensor/src/depthwise_pointwise_convolution.cpp @@ -50,10 +50,10 @@ inline EE depthwise_pointwise_convolution_infer_output_size_cpu(TensorDesc input U32 strideH = convParamSpec.stride_h; U32 strideW = convParamSpec.stride_w; - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; U32 dilateH = convParamSpec.dilatedRate_h; U32 dilateW = convParamSpec.dilatedRate_w; @@ -66,7 +66,12 @@ inline EE depthwise_pointwise_convolution_infer_output_size_cpu(TensorDesc input CHECK_STATUS(NOT_MATCH); } - *outputDesc = tensor4df(targetDataType, DF_NCHWC8, in, fn2, oh, ow); + DataFormat odf = 
DF_NCHWC8; + if ((idt == DT_U8_Q || idf == DF_NCHWC16) && ic % 16 == 0) { + odf = DF_NCHWC16; + } + + *outputDesc = tensor4df(targetDataType, odf, in, fn2, oh, ow); return SUCCESS; } @@ -103,6 +108,15 @@ EE depthwise_pointwise_convolution_infer_output_size(Tensor *inputTensor, if (fn % 8 != 0) { CHECK_STATUS(NOT_SUPPORTED); } +#ifdef _USE_INT8 + if (IS_X86_AVX512(archInfo->arch) && (inputDesc.dt == DT_U8_Q)) + { + outputDesc.df = DF_NCHWC16; + if (fn % 16 != 0) { + CHECK_STATUS(NOT_SUPPORTED); + } + } +#endif } outputTensor->resize(outputDesc); return SUCCESS; @@ -180,7 +194,17 @@ EE depthwise_pointwise_convolution_transform_filter_bytes(Tensor dwFilterTensor, #ifdef _USE_X86 } else if (IS_X86(arch)) { U32 *size = (U32 *)dwBytes; - *size = tensorNumBytes(dwFilterDesc) + 32; + if (DT_I8 == dwFilterDesc.dt) { + DataType fdt; + DataFormat fdf; + U32 fn, fc, fh, fw; + CHECK_STATUS(tensor4dGet(dwFilterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + U32 alignSize = 4; + U32 filterSize = (fh * fw + alignSize - 1) / alignSize * alignSize; + *size = filterSize * fn * fc + 32 + fc * 4; + } else { + *size = tensorNumBytes(dwFilterDesc) + 32; + } size = (U32 *)pwBytes; *size = tensorNumBytes(pwFilterDesc) + 32; ret = SUCCESS; @@ -281,7 +305,7 @@ EE depthwise_pointwise_convolution_infer_forward_tmp_bytes(Tensor inputTensor, #ifdef _USE_X86 } else if (IS_X86(arch)) { ret = depthwise_convolution_infer_forward_tmp_bytes_x86( - inputDesc, outputDesc, convParamSpec, algorithm, bytes); + inputDesc, dwFilterDesc, outputDesc, convParamSpec, algorithm, bytes); #endif #ifdef _USE_NEON } else if (IS_ARM(arch)) { @@ -303,6 +327,7 @@ EE depthwise_pointwise_convolution(std::vector inputTensors, Tensor pwFilterTensor, ConvolutionParamSpec convParamSpec, DepthwiseConvolutionForwardAlgorithm algorithm, + void *scale, Tensor dwBiasTensor, Tensor pwBiasTensor, std::vector tmpTensors, @@ -358,7 +383,7 @@ EE depthwise_pointwise_convolution(std::vector inputTensors, #ifdef _USE_X86 } else if (IS_X86(arch)) { ret = depthwise_pointwise_convolution_x86(inputDesc, input, eltwiseInput, dwFilterDesc, - dwFilter, pwFilterDesc, pwFilter, convParamSpec, algorithm, dwBiasDesc, dwBias, + dwFilter, pwFilterDesc, pwFilter, convParamSpec, algorithm, scale, dwBiasDesc, dwBias, pwBiasDesc, pwBias, tmpBytes, tmp, outputDesc, output, depthwiseActivationParamSpec, pointwiseActivationParamSpec, archInfo->arch); #endif @@ -388,7 +413,7 @@ EE depthwise_pointwise_convolution(std::vector inputTensors, if (inputTensors.size() > 1 && isEltwiseSeperate) { std::vector eltwiseInputTensors = {outputTensor, inputTensors[1]}; EltwiseParamSpec eltwiseDesc; - eltwiseDesc.elt_mode = ELTWISE_SUM; + eltwiseDesc.mode = ELTWISE_SUM; eltwiseDesc.activation_type = eltwiseActDesc.mode; eltwiseDesc.activation_spec = convParamSpec.activation_spec; ret = eltwise(eltwiseInputTensors, eltwiseDesc, tmpTensors[0], outputTensor, archInfo); diff --git a/compute/tensor/src/eltwise.cpp b/compute/tensor/src/eltwise.cpp index ff892065..795db3ed 100644 --- a/compute/tensor/src/eltwise.cpp +++ b/compute/tensor/src/eltwise.cpp @@ -26,37 +26,24 @@ inline EE eltwise_infer_output_size_cpu(std::vector inputDesc, TensorDesc *outputDesc) { if (nullptr == outputDesc) { - CHECK_STATUS(NULL_POINTER); + return NULL_POINTER; } U32 num = inputDesc.size(); - if (num <= 0) { + if (num <= 1) { return NOT_MATCH; } - if (num == 1) { - *outputDesc = inputDesc[0]; - return SUCCESS; - } - U32 arrayDimMax = 0; - U32 minDims = inputDesc[0].nDims; for (U32 i = 1; i < num; i++) { if (inputDesc[i].nDims > 
inputDesc[arrayDimMax].nDims) { arrayDimMax = i; } - if (inputDesc[i].nDims < minDims) { - minDims = inputDesc[i].nDims; - } } U32 nchwc8Count = 0; U32 nchwc16Count = 0; U32 nhwcCount = 0; + bool sameDim = true; for (U32 i = 0; i < num; i++) { - // Output from 1D-conv + 3D tensors - //if (inputDesc[i].nDims == 4 && inputDesc[i].dims[0] == 1 && minDims == 3) { - // inputDesc[i] = tensor3df(inputDesc[i].dt, inputDesc[i].df, inputDesc[i].dims[3], - // inputDesc[i].dims[2], inputDesc[i].dims[1]); - //} if (inputDesc[i].df == DF_NCHWC8) { nchwc8Count++; } @@ -68,12 +55,13 @@ inline EE eltwise_infer_output_size_cpu(std::vector inputDesc, Tenso nhwcCount++; std::swap(inputDesc[i].dims[0], inputDesc[i].dims[1]); } + if (tensorNumElements(inputDesc[i]) != tensorNumElements(inputDesc[0])) { + sameDim = false; + } } - U32 dim = inputDesc[arrayDimMax].nDims; *outputDesc = inputDesc[arrayDimMax]; - - for (U32 i = 0; i < dim; i++) { + for (U32 i = 0; i < outputDesc->nDims; i++) { for (U32 j = 0; j < num; j++) { if (inputDesc[j].nDims > i) { int max_value = UNI_MAX(outputDesc->dims[i], inputDesc[j].dims[i]); @@ -92,13 +80,9 @@ inline EE eltwise_infer_output_size_cpu(std::vector inputDesc, Tenso if (nchwc16Count > 0 && nchwc16Count != num) { outputDesc->df = DF_NCHWC16; } - //if (nchwc8Count > 0 && nhwcCount > 0) { - // outputDesc->df = DF_NCHWC8; - // if (outputDesc->nDims == 3) { - // *outputDesc = tensor4df(outputDesc->dt, DF_NCHWC8, outputDesc->dims[2], - // outputDesc->dims[1], outputDesc->dims[0], 1); - // } - //} + if (!sameDim && (nchwc8Count > 0 || nchwc16Count > 0)) { + outputDesc->df = DF_NCHW; + } return SUCCESS; } @@ -167,8 +151,11 @@ EE eltwise(std::vector inputTensor, void *tmp = get_ptr_from_tensor(tmpTensor, arch); TensorDesc outputDesc = outputTensor.get_desc(); void *output = get_ptr_from_tensor(outputTensor, arch); + + EE ret = NOT_SUPPORTED; + if (IS_CPU(arch)) { +#ifdef _USE_CPU #if defined(_USE_NEON) && defined(_USE_INT8) - if (!IS_GPU(arch)) { for (U32 i = 0; i < inputTensor.size(); i++) { if (inputDesc[i].dt == DT_I8) { F32 scale = inputTensor[i].get_scale(); @@ -182,12 +169,7 @@ EE eltwise(std::vector inputTensor, tmp = (U8 *)tmp + dTensor.bytes(); } } - } #endif - - EE ret = NOT_SUPPORTED; - if (IS_CPU(arch)) { -#ifdef _USE_CPU ret = eltwise_cpu(inputDesc, input, eltwiseDesc, tmpBytes, tmp, outputDesc, output, arch); #endif #ifdef _USE_GPU diff --git a/compute/tensor/src/embedding.cpp b/compute/tensor/src/embedding.cpp index d9ca84c2..98ebf7cc 100644 --- a/compute/tensor/src/embedding.cpp +++ b/compute/tensor/src/embedding.cpp @@ -44,7 +44,7 @@ EE embedding_infer_output_size(Tensor *inputTensor, } CHECK_REQUIREMENT(tensorIs2d(inputDesc)); CHECK_STATUS(tensor2dGet(inputDesc, &dt, &df, &batch, &step)); - outputDesc = tensor3df(outputDt, DF_MTK, batch, step, p.num_output); + outputDesc = tensor3df(outputDt, DF_MTK, batch, step, p.num_outputs); if (inputOneDim) { outputDesc.nDims = 2; outputDesc.df = DF_NORMAL; diff --git a/compute/tensor/src/equal.cpp b/compute/tensor/src/equal.cpp index ee129c48..50a86a43 100644 --- a/compute/tensor/src/equal.cpp +++ b/compute/tensor/src/equal.cpp @@ -10,6 +10,7 @@ // WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
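The reworked eltwise_infer_output_size_cpu above derives the output shape by broadcasting: each output dimension is the maximum of that dimension over all inputs, and the format drops back to DF_NCHW when the inputs differ in element count while blocked (NCHWC8/NCHWC16) layouts are involved. A simplified sketch of the dimension rule with plain vectors, assuming dims are stored innermost-first as in TensorDesc:

    #include <algorithm>
    #include <vector>

    // Simplified model of the broadcast shape rule above: every output
    // dimension is the maximum of that dimension over all inputs; missing
    // leading dimensions are treated as 1.
    static std::vector<unsigned> broadcast_dims(const std::vector<std::vector<unsigned>> &inputs)
    {
        size_t rank = 0;
        for (const auto &d : inputs) {
            rank = std::max(rank, d.size());
        }
        std::vector<unsigned> out(rank, 1);
        for (const auto &d : inputs) {
            for (size_t i = 0; i < d.size(); ++i) {
                out[i] = std::max(out[i], d[i]);
            }
        }
        return out;
    }
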
+#if 0 #include "tensor_computing.h" EE equal_infer_output_size(Tensor *inputTensor, Tensor *outputTensor, ArchInfo_t archInfo) @@ -22,9 +23,8 @@ EE equal_infer_output_size(Tensor *inputTensor, Tensor *outputTensor, ArchInfo_t } // attention: comparision ptr will be fixed in mt -template -static EE equal_kernel( - U32 inputLen, U32 comparisonLen, T *inputPtr, F32 *comparisionPtr, bool not_equal, U8 *outputPtr) +template +static EE equal_kernel(T1 *a1, int len1, T2 *a2, int len2, bool not_equal, U8 *out) { U8 equal_flag, notequal_flag; if (not_equal) { @@ -34,27 +34,27 @@ static EE equal_kernel( equal_flag = 1; notequal_flag = 0; } - if (inputLen == comparisonLen) { - for (U32 i = 0; i < inputLen; ++i) { - if (inputPtr[i] == (T)(comparisionPtr[i])) { - outputPtr[i] = equal_flag; + EE ret = SUCCESS; + if (len1 == len2) { + for (int i = 0; i < len1; ++i) { + if (a1[i] == (T1)(a2[i])) { + out[i] = equal_flag; } else { - outputPtr[i] = notequal_flag; + out[i] = notequal_flag; } } - } else if (comparisonLen == 1) { - F32 compF = comparisionPtr[0]; - for (U32 i = 0; i < inputLen; ++i) { - if (inputPtr[i] == (T)compF) { - outputPtr[i] = equal_flag; + } else if (len2 == 1) { + for (int i = 0; i < len1; ++i) { + if (a1[i] == (T1)(a2[0])) { + out[i] = equal_flag; } else { - outputPtr[i] = notequal_flag; + out[i] = notequal_flag; } } } else { - return NOT_SUPPORTED; + ret = NOT_SUPPORTED; } - return SUCCESS; + return ret; } EE equal(Tensor inputTensor, @@ -64,37 +64,43 @@ EE equal(Tensor inputTensor, ArchInfo_t archInfo) { auto arch = archInfo->arch; - void *input = get_ptr_from_tensor(inputTensor, arch); - void *comparision = get_ptr_from_tensor(compareTensor, arch); - void *output = get_ptr_from_tensor(outputTensor, arch); TensorDesc inputDesc = inputTensor.get_desc(); U32 inputLen = tensorNumElements(inputDesc); - U32 comparisonLen = tensorNumElements(compareTensor.get_desc()); + void *input = get_ptr_from_tensor(inputTensor, arch); + TensorDesc compareDesc = compareTensor.get_desc(); + U32 compareLen = tensorNumElements(compareDesc); + void *compare = get_ptr_from_tensor(compareTensor, arch); + void *output = get_ptr_from_tensor(outputTensor, arch); - EE ret = SUCCESS; + EE ret = NOT_SUPPORTED; switch (inputDesc.dt) { #ifdef _USE_FP32 case DT_F32: { - ret = equal_kernel( - inputLen, comparisonLen, (F32 *)input, (F32 *)comparision, p.invert, (U8 *)output); + ret = equal_kernel( + (F32 *)input, inputLen, (F32 *)compare, compareLen, p.invert, (U8 *)output); break; } #endif #ifdef _USE_FP16 case DT_F16: { - ret = equal_kernel( - inputLen, comparisonLen, (F16 *)input, (F32 *)comparision, p.invert, (U8 *)output); + if (compareDesc.dt == DT_F32) { + ret = equal_kernel( + (F16 *)input, inputLen, (F32 *)compare, compareLen, p.invert, (U8 *)output); + } else { + ret = equal_kernel( + (F16 *)input, inputLen, (F16 *)compare, compareLen, p.invert, (U8 *)output); + } break; } #endif case DT_I32: { - ret = equal_kernel( - inputLen, comparisonLen, (I32 *)input, (F32 *)comparision, p.invert, (U8 *)output); + ret = equal_kernel( + (I32 *)input, inputLen, (I32 *)compare, compareLen, p.invert, (U8 *)output); break; } default: - ret = NOT_SUPPORTED; break; } return ret; } +#endif diff --git a/compute/tensor/src/expand.cpp b/compute/tensor/src/expand.cpp index a34b9562..88c8316d 100644 --- a/compute/tensor/src/expand.cpp +++ b/compute/tensor/src/expand.cpp @@ -21,16 +21,16 @@ EE expand_infer_output_size( { TensorDesc inputDesc = inputTensor->get_desc(); TensorDesc outputDesc = inputDesc; - 
CHECK_REQUIREMENT((I32)inputDesc.nDims <= p.shape_size); - outputDesc.nDims = (U32)p.shape_size; + CHECK_REQUIREMENT((I32)inputDesc.nDims <= p.num_shape); + outputDesc.nDims = (U32)p.num_shape; I32 inputDims = inputDesc.nDims; - for (I32 i = 0; i < p.shape_size; ++i) { - I32 reverseDim = p.shape_size - 1 - i; + for (I32 i = 0; i < p.num_shape; ++i) { + I32 reverseDim = p.num_shape - 1 - i; if ((reverseDim >= inputDims) || (reverseDim < inputDims && inputDesc.dims[reverseDim] == 1)) { - outputDesc.dims[reverseDim] = p.shape_dims[i]; + outputDesc.dims[reverseDim] = p.shape[i]; } else { - CHECK_REQUIREMENT(p.shape_dims[i] <= (I32)inputDesc.dims[reverseDim]); + CHECK_REQUIREMENT(p.shape[i] <= (I32)inputDesc.dims[reverseDim]); outputDesc.dims[reverseDim] = inputDesc.dims[reverseDim]; } } @@ -41,6 +41,10 @@ EE expand_infer_output_size( } #endif } + if (outputDesc.dt == DT_F32 && outputDesc.nDims == 4 && + outputDesc.dims[outputDesc.nDims - 2] % 8 == 0) { + outputDesc.df = DF_NCHWC8; + } outputTensor->resize(outputDesc); return SUCCESS; } @@ -48,17 +52,20 @@ EE expand_infer_output_size( EE expand_infer_forward_tmp_bytes( Tensor inputTensor, Tensor outputTensor, U32 *bytes, ArchInfo_t archInfo) { + TensorDesc outputDesc = outputTensor.get_desc(); + TensorDesc inputDesc = inputTensor.get_desc(); if (IS_GPU(archInfo->arch)) { #ifdef _USE_GPU GCLMemDesc gclmemInputDesc = ocl_get_desc(inputTensor); GCLMemDesc gclmemOutputDesc = ocl_get_desc(outputTensor); - TensorDesc inputDesc = inputTensor.get_desc(); - TensorDesc outputDesc = outputTensor.get_desc(); CHECK_STATUS(expand_infer_forward_tmp_bytes_mali( inputDesc, outputDesc, gclmemInputDesc, gclmemOutputDesc, bytes)); #endif } else { *bytes = 0; + if (outputDesc.df != inputDesc.df) { + *bytes += tensorNumBytes(outputDesc); + } } return SUCCESS; } @@ -80,10 +87,10 @@ void expand_copy_kernel(U32 dims, if (dims == lastDims) { if (dims >= inDims || inD[dims] == 1) { for (U32 i = 0; i < outD[dims]; ++i) { - memcpy(output + i * minCopySize, input, minCopySize); + UNI_MEMCPY(output + i * minCopySize, input, minCopySize); } } else { - memcpy(output, input, minCopySize * inD[dims]); + UNI_MEMCPY(output, input, minCopySize * inD[dims]); } return; } @@ -97,7 +104,7 @@ void expand_copy_kernel(U32 dims, expand_copy_kernel( dims - 1, inDims, outDims, inD, outD, input, output, dt, lastDims, minCopySize); for (U32 i = 1; i < outD[dims]; ++i) { - memcpy(output + i * oOffSize, output, oOffSize); + UNI_MEMCPY(output + i * oOffSize, output, oOffSize); } return; } @@ -120,11 +127,14 @@ EE expand( auto arch = archInfo->arch; void *input = get_ptr_from_tensor(inputTensor, arch); void *output = get_ptr_from_tensor(outputTensor, arch); + void *tmp = get_ptr_from_tensor(tmpTensor, arch); TensorDesc inputDesc = inputTensor.get_desc(); TensorDesc outputDesc = outputTensor.get_desc(); + if (outputDesc.df != inputDesc.df) { + output = tmp; + } if (IS_GPU(arch)) { #ifdef _USE_GPU - void *tmp = get_ptr_from_tensor(tmpTensor, arch); CHECK_STATUS(expand_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, (GCLMem_t)input, p, (GCLMem_t)tmp, outputDesc, (GCLMem_t)output)); #endif @@ -144,6 +154,11 @@ EE expand( expand_copy_kernel((outputDesc.nDims - 1), inputDesc.nDims, outputDesc.nDims, inputDesc.dims, outputDesc.dims, (U8 *)input, (U8 *)output, idt, lastDims, minCopySize); + if (outputDesc.df != inputDesc.df) { + TensorDesc oldDesc = outputDesc; + oldDesc.df = inputDesc.df; + transformFormat(oldDesc, output, outputDesc, get_ptr_from_tensor(outputTensor, arch)); + } } return 
SUCCESS; } diff --git a/compute/tensor/src/fully_connected.cpp b/compute/tensor/src/fully_connected.cpp index 811f2a65..99ff0b92 100644 --- a/compute/tensor/src/fully_connected.cpp +++ b/compute/tensor/src/fully_connected.cpp @@ -11,8 +11,6 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -#include - #include "tensor_computing.h" #include "blas_enhance.h" #ifdef _USE_GPU @@ -196,10 +194,12 @@ EE fully_connected_transform_filter_bytes(Tensor filterTensor, void *bytes, Arch } else if (IS_X86(archInfo->arch)) { alignments = 8; #ifdef _USE_INT8 + alignments = 16; + fh = (fh + 8 - 1) / 8 * 8; *size += UNI_MAX(fw, fh) * 4; #endif } - fh = (fh + alignments - 1) / alignments * alignments; + fw = (fw + alignments - 1) / alignments * alignments; *size += fw * fh + 32; *size *= bytesOf(fdt); } @@ -248,7 +248,7 @@ EE fully_connected_transform_filter_kernel(TensorDesc inputDesc, } } } else { - memcpy(filterTransformed, filter, tensorNumBytes(filterDesc)); + UNI_MEMCPY(filterTransformed, filter, tensorNumBytes(filterDesc)); } U32 fh_after = fh; @@ -391,13 +391,7 @@ EE fully_connected(Tensor inputTensor, qIDesc.dt = DT_I8; qODesc.dt = DT_I32; } - if (qIDesc.dt != idt) { - CHECK_STATUS(quantize_cpu(inputDesc, input, &qIDesc, tmp, &scaleI, arch)); - inputDesc = qIDesc; - idt = qIDesc.dt; - input = (U8 *)tmp; - tmp = (U8 *)tmp + tensorNumBytes(inputDesc); - } + CHECK_REQUIREMENT(idt == qIDesc.dt); scaleO = scaleI * filterTensor.get_scale(); if (IS_X86(arch)) { @@ -406,8 +400,10 @@ EE fully_connected(Tensor inputTensor, if (outputDesc.dt != qODesc.dt) { offsetC += tensorNumBytes(qODesc); } + void *transOffsetC = (void *)((U8 *)filter + + UNI_ALIGN(filterDesc.dims[0], 16) * UNI_ALIGN(filterDesc.dims[1], 8)); CHECK_STATUS(quantize_bias_offsetC( - bias, biasDesc, DT_I32, filter, filterDesc, &scaleO, offsetC)); + bias, biasDesc, DT_I32, transOffsetC, filterDesc, &scaleO, offsetC)); bias = nullptr; if (outputDesc.dt == DT_U8_Q && outputTensor.get_scale() > 0) { scale[1] = scale[1] / scaleO; @@ -421,7 +417,7 @@ EE fully_connected(Tensor inputTensor, CHECK_REQUIREMENT(DT_I8 == outputDesc.dt); biasDesc.dt = DT_I32; I32 *biasI = (I32 *)tmp; -#ifdef __aarch64__ +#ifdef _USE_FP16 F16 *biasF = (F16 *)bias; #else F32 *biasF = (F32 *)bias; @@ -452,11 +448,11 @@ EE fully_connected(Tensor inputTensor, U8 *outArray = (U8 *)output; U32 size = tensorNumBytes(biasDesc); for (U32 i = 0; i < M; i++) { - memcpy(outArray + i * size, bias, size); + UNI_MEMCPY(outArray + i * size, bias, size); } } } else { - memset(output, 0, tensorNumBytes(outputDesc)); + UNI_MEMSET(output, 0, tensorNumBytes(outputDesc)); } // If weight is transformed for mmm, don't run as mvm diff --git a/compute/tensor/src/gather.cpp b/compute/tensor/src/gather.cpp index 6feeaf54..a6eddbab 100644 --- a/compute/tensor/src/gather.cpp +++ b/compute/tensor/src/gather.cpp @@ -55,11 +55,8 @@ EE gather_infer_output_size(Tensor *dataTensor, ArchInfo_t archInfo) { auto arch = archInfo->arch; - if (dataTensor == nullptr) { - CHECK_STATUS(NULL_POINTER); - } - if (outputTensor == nullptr) { - CHECK_STATUS(NULL_POINTER); + if (dataTensor == nullptr || outputTensor == nullptr) { + return NULL_POINTER; } TensorDesc dataDesc = dataTensor->get_desc(); TensorDesc indexDesc = indexTensor->get_desc(); @@ -76,13 +73,20 @@ EE gather_infer_output_size(Tensor *dataTensor, } outputDesc.nDims = e + 
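In the fully_connected path above, when a bias is present the bias vector is copied once into every output row before the matrix multiply accumulates on top of it. A small illustrative helper (not part of the patch; names are made up):

```cpp
#include <cstring>

// Pre-fill each of the M output rows with the bias vector so the subsequent
// matrix multiply can accumulate into an already-biased buffer.
void fill_rows_with_bias(void *output, const void *bias, size_t rowBytes, size_t M)
{
    unsigned char *out = static_cast<unsigned char *>(output);
    for (size_t i = 0; i < M; i++) {
        memcpy(out + i * rowBytes, bias, rowBytes);
    }
}
```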
indexDesc.nDims; } else { - outputDesc = dataDesc; + outputDesc = indexDesc; + outputDesc.dt = dataDesc.dt; if (!p.element_level) { + outputDesc = dataDesc; if (tensorNumElements(indexDesc) == 1 && p.index_scalar) { for (int i = axis; i < (int)outputDesc.nDims - 1; i++) { outputDesc.dims[i] = outputDesc.dims[i + 1]; } - outputDesc.nDims--; + if (outputDesc.nDims > 1) { + outputDesc.nDims--; + } else { + outputDesc.dims[0] = 1; + outputDesc.df = DF_SCALAR; + } } else { for (int i = (int)outputDesc.nDims - 1; i > axis; i--) { outputDesc.dims[i + indexDesc.nDims - 1] = outputDesc.dims[i]; @@ -105,8 +109,16 @@ EE gather_infer_output_size(Tensor *dataTensor, } #endif } + EE ret = SUCCESS; +#ifdef _USE_CPU + if (tensorIsShape(dataDesc)) { + ret = gather_cpu(dataDesc, dataDesc.dims + dataDesc.nDims, indexDesc, + indexDesc.dims + indexDesc.nDims, p, nullptr, outputDesc, + outputDesc.dims + outputDesc.nDims); + } +#endif outputTensor->resize(outputDesc); - return SUCCESS; + return ret; } EE gather_infer_forward_tmp_bytes(Tensor dataTensor, @@ -117,13 +129,14 @@ EE gather_infer_forward_tmp_bytes(Tensor dataTensor, ArchInfo_t archInfo) { auto arch = archInfo->arch; - + EE ret = NOT_SUPPORTED; if (IS_CPU(arch)) { if (dataTensor.get_desc().df == DF_NCHWC8) { *bytes = dataTensor.bytes(); } else { *bytes = 0; } + ret = SUCCESS; #ifdef _USE_GPU } else if (IS_GPU(arch)) { TensorDesc dataDesc = dataTensor.get_desc(); @@ -131,9 +144,9 @@ EE gather_infer_forward_tmp_bytes(Tensor dataTensor, TensorDesc outputDesc = outputTensor.get_desc(); GCLMemDesc gclmemDataDesc = ocl_get_desc(dataTensor); GCLMemDesc gclmemOutputDesc = ocl_get_desc(outputTensor); - CHECK_STATUS(gather_infer_forward_tmp_bytes_mali( - dataDesc, gclmemDataDesc, indexDesc, p, outputDesc, gclmemOutputDesc, bytes)); + ret = gather_infer_forward_tmp_bytes_mali( + dataDesc, gclmemDataDesc, indexDesc, p, outputDesc, gclmemOutputDesc, bytes); #endif } - return SUCCESS; + return ret; } diff --git a/compute/tensor/src/gpu/mali/activation.cpp b/compute/tensor/src/gpu/mali/activation.cpp index d011a26f..9af9f23f 100644 --- a/compute/tensor/src/gpu/mali/activation.cpp +++ b/compute/tensor/src/gpu/mali/activation.cpp @@ -36,7 +36,8 @@ inline EE activation_checkpara_mali(GCLHandle_t handle, activationMode != ACTIVATION_H_SWISH && activationMode != ACTIVATION_GELU && activationMode != ACTIVATION_TANH && activationMode != ACTIVATION_SIGMOID && activationMode != ACTIVATION_ABS && activationMode != ACTIVATION_LOG && - activationMode != ACTIVATION_NEG) { + activationMode != ACTIVATION_NEG && activationMode != ACTIVATION_EXP && + activationMode != ACTIVATION_SWISH) { CHECK_STATUS(NOT_SUPPORTED); } if (input->desc.memFormat != output->desc.memFormat) { diff --git a/compute/tensor/src/gpu/mali/bilateral_slice_apply.cpp b/compute/tensor/src/gpu/mali/bilateral_slice_apply.cpp index 3113d809..e888381d 100644 --- a/compute/tensor/src/gpu/mali/bilateral_slice_apply.cpp +++ b/compute/tensor/src/gpu/mali/bilateral_slice_apply.cpp @@ -33,7 +33,7 @@ inline EE bilateral_slice_apply_checkpara_mali_common(GCLHandle_t handle, if (nullptr == handle || nullptr == input || nullptr == grid || nullptr == output) { return NULL_POINTER; } - if (bilateralSliceApplyParamSpec.mode == BSliceApply_NULL && nullptr == guide) { + if (bilateralSliceApplyParamSpec.mode == BSLICE_APPLY_NULL && nullptr == guide) { return NULL_POINTER; } if (inputDesc.df != guideDesc.df || inputDesc.df != gridDesc.df) { @@ -51,15 +51,14 @@ inline EE bilateral_slice_apply_checkpara_mali_common(GCLHandle_t handle, 
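The gather shape-inference change above starts the output from the index tensor's shape (taking the data tensor's data type) and, when the index is a single scalar, squeezes the gathered axis out of the data shape, collapsing a 1-D input to a scalar descriptor. A hedged sketch of the squeeze step only; this uses an outermost-first shape vector for readability, whereas the library stores dims innermost-first, so the real index arithmetic differs:

```cpp
#include <vector>

// Remove the gathered axis from a shape when the index is a single scalar,
// as gather_infer_output_size does; a 1-D input degenerates to a scalar-like shape.
std::vector<int> squeeze_axis(std::vector<int> shape, int axis)
{
    if (shape.size() > 1) {
        shape.erase(shape.begin() + axis);
    } else {
        shape[0] = 1;
    }
    return shape;
}
```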
if (inputDesc.dims[2] != outputDesc.dims[2]) { return NOT_MATCH; } - if ((gridDesc.dims[2] % bilateralSliceApplyParamSpec.coefficient_len) != 0) { + if ((gridDesc.dims[2] % bilateralSliceApplyParamSpec.coefficient) != 0) { return NOT_MATCH; } if (bilateralSliceApplyParamSpec.has_offset == true) { - if (bilateralSliceApplyParamSpec.coefficient_len != - inputDesc.dims[2] * (inputDesc.dims[2] + 1)) { + if (bilateralSliceApplyParamSpec.coefficient != inputDesc.dims[2] * (inputDesc.dims[2] + 1)) { return NOT_MATCH; } - if (bilateralSliceApplyParamSpec.coefficient_len != 12) { + if (bilateralSliceApplyParamSpec.coefficient != 12) { return NOT_SUPPORTED; } } else { diff --git a/compute/tensor/src/gpu/mali/cast.cpp b/compute/tensor/src/gpu/mali/cast.cpp index 218f795d..b959807b 100644 --- a/compute/tensor/src/gpu/mali/cast.cpp +++ b/compute/tensor/src/gpu/mali/cast.cpp @@ -42,9 +42,9 @@ inline void set_dt_name(TensorDesc desc, char *name) { DataType dt = desc.dt; if (dt == DT_F16) { - strcpy(name, "f16"); + UNI_STRCPY(name, "f16"); } else if (dt == DT_I32) { - strcpy(name, "i32"); + UNI_STRCPY(name, "i32"); } else { CHECK_STATUS(NOT_SUPPORTED); } diff --git a/compute/tensor/src/gpu/mali/check.cpp b/compute/tensor/src/gpu/mali/check.cpp index 084b824a..80aea30d 100644 --- a/compute/tensor/src/gpu/mali/check.cpp +++ b/compute/tensor/src/gpu/mali/check.cpp @@ -41,7 +41,7 @@ inline EE check_checkpara_mali(GCLHandle_t handle, if (outputDesc.dt != DT_I32) { CHECK_STATUS(NOT_MATCH); } - if (p.check_mode != CHECK_EQUAL) { + if (p.mode != CHECK_EQUAL) { CHECK_STATUS(NOT_SUPPORTED); } return SUCCESS; diff --git a/compute/tensor/src/gpu/mali/cl/activation.cl b/compute/tensor/src/gpu/mali/cl/activation.cl index 590889f3..530ff37e 100644 --- a/compute/tensor/src/gpu/mali/cl/activation.cl +++ b/compute/tensor/src/gpu/mali/cl/activation.cl @@ -51,7 +51,7 @@ __kernel void MANGLE_NAME(activation_, IOM, FM, AM)(const int w, LOAD_MEM_V4_COMMON(val, idx, idy, idz, iw_str, ih_str, i_off, input); ACTIVATION_V4(val); -#if defined(USE_TANH) || defined(USE_SIGMOID) || defined(USE_HSIGMOID) || defined(USE_GELU) +#if defined(USE_TANH) || defined(USE_SIGMOID) || defined(USE_HSIGMOID) || defined(USE_GELU) || defined(USE_EXP) char ec = (((idz << 2) + 4) <= c) ? 4 : (c & 3); if (ec < 2) { val.y = 0; diff --git a/compute/tensor/src/gpu/mali/cl/col2im.cl b/compute/tensor/src/gpu/mali/cl/col2im.cl index ff20e0b4..2685cc74 100644 --- a/compute/tensor/src/gpu/mali/cl/col2im.cl +++ b/compute/tensor/src/gpu/mali/cl/col2im.cl @@ -48,7 +48,7 @@ __kernel void MANGLE_NAME(col2im_, IOM)(const int iw, int sidh_j = pidy % sh; int in_hx = (sidh_i < ih) ? sidh_i : (ih - 1); int in_hy = (sidh_i < ih) ? sidh_j : ((sidh_i - ih + 1) * sh + sidh_j); - int in_hl = (fw - in_hy + sh - 1) / sh; + int in_hl = (fh - in_hy + sh - 1) / sh; if (in_hl > in_hx + 1) { in_hl = in_hx + 1; } diff --git a/compute/tensor/src/gpu/mali/cl/conv_invgemm_col2img.cl b/compute/tensor/src/gpu/mali/cl/conv_invgemm_col2img.cl new file mode 100644 index 00000000..32fa8ce5 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/conv_invgemm_col2img.cl @@ -0,0 +1,80 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
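The one-token fix in col2im.cl above (fw replaced by fh) makes the row count an integer ceiling division of the remaining filter height by the vertical stride. A tiny reference for that identity, with an illustrative example in the comment:

```cpp
// Integer ceiling division as used in the col2im bound:
// in_hl = ceil((fh - in_hy) / sh) == (fh - in_hy + sh - 1) / sh.
static inline int ceil_div(int a, int b)
{
    return (a + b - 1) / b;
}
// Example: fh = 5, in_hy = 2, sh = 2  ->  ceil_div(3, 2) == 2 contributing filter rows.
```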
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "kernel_def.h" +#define MANGLE_NAME_IMPL(base, IOM, AM) base##IOM##AM +#define MANGLE_NAME(base, IOM, AM) MANGLE_NAME_IMPL(base, IOM, AM) + +__kernel void MANGLE_NAME(conv_invgemm_col2img_, IOM, AM)(const int iw, + const int ih, + const int fw, + const int fh, + const int pw, + const int ph, + const int ow_str, + const int oh_str, + const int o_off, + const int oc, + const int bx, + const int by, + __global const T *in, + __read_only image1d_t bias, + KERNEL_MEM out) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); + const ushort c_pitch = (oc + 3) >> 2; + const int idc = idz % c_pitch; + if (idx >= bx || idy >= by) { + return; + } + + const int pidx = idx + pw; + const int pidy = idy + ph; + + int in_hx = (pidy < ih) ? pidy : (ih - 1); + int in_hy = (pidy < ih) ? 0 : (pidy - ih + 1); + int in_hl = fh - in_hy; + if (in_hl > in_hx + 1) { + in_hl = in_hx + 1; + } + if (pidy < 0) { + in_hl = 0; + } + + int in_wx = (pidx < iw) ? pidx : (iw - 1); + int in_wy = (pidx < iw) ? 0 : (pidx - iw + 1); + int in_wl = fw - in_wy; + if (in_wl > in_wx + 1) { + in_wl = in_wx + 1; + } + if (pidx < 0) { + in_wl = 0; + } + + int in_off_h = iw * (in_hx + ih * fw * (in_hy + idz * fh)); + int in_str_h = iw * (ih * fw - 1); + int in_off_w = in_wx + in_wy * ih * iw; + int in_str_w = ih * iw - 1; + T4 sum = read_imageh(bias, sampler, idc); + + for (int i = 0; i < in_hl; i++) { + for (int j = 0; j < in_wl; j++) { + sum += vload4(in_off_h + in_off_w + j * in_str_w, in); + } + in_off_h += in_str_h; + } + ACTIVATION_V4(sum); + STORE_MEM_V4_COMMON(sum, idx, idy, idz, ow_str, oh_str, o_off, out); +} diff --git a/compute/tensor/src/gpu/mali/cl/conv_invgemm_trans_flt.cl b/compute/tensor/src/gpu/mali/cl/conv_invgemm_trans_flt.cl new file mode 100644 index 00000000..e9402b61 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/conv_invgemm_trans_flt.cl @@ -0,0 +1,52 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
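The new conv_invgemm_col2img kernel above clamps, for each padded output coordinate, the first contributing input index and the number of overlapping filter taps (stride 1), then accumulates those taps plus the bias and applies the activation. A hedged CPU-side reading of just the bound computation, with hypothetical names:

```cpp
// How many filter taps overlap padded output position p (stride 1), following
// the in_hx / in_hy / in_hl computation in conv_invgemm_col2img.
struct Overlap {
    int firstInput;   // highest contributing input index
    int firstFilter;  // first contributing filter tap
    int count;        // number of overlapping taps
};

Overlap overlap_1d(int p, int inputLen, int filterLen)
{
    Overlap o;
    o.firstInput = (p < inputLen) ? p : (inputLen - 1);
    o.firstFilter = (p < inputLen) ? 0 : (p - inputLen + 1);
    o.count = filterLen - o.firstFilter;
    if (o.count > o.firstInput + 1) {
        o.count = o.firstInput + 1;
    }
    if (p < 0) {
        o.count = 0;
    }
    return o;
}
```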
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "kernel_def.h" +#define MANGLE_NAME_IMPL(base, K) base##K +#define MANGLE_NAME(base, K) MANGLE_NAME_IMPL(base, K) + +__kernel void MANGLE_NAME(conv_invgemm_trans_flt_, K)(const int fw, + const int fh, + const int fwh, + const int fc, + const int fn, + __global const T *fltdata, + __global T *flt) +{ + int idx = get_global_id(0); + int idy = get_global_id(1); + int idz = get_global_id(2); + int iy = idy << 2; + const int flt_off = (idz * fc + iy) * fwh + idx; + T4 val = 0; + val.x = fltdata[flt_off]; + if (iy + 1 < fc) { + val.y = fltdata[flt_off + fwh]; + } + if (iy + 2 < fc) { + val.z = fltdata[flt_off + fwh * 2]; + } + if (iy + 3 < fc) { + val.w = fltdata[flt_off + fwh * 3]; + } + const int bc = (fc + 3) >> 2; + int ox = idz & 3; + int oy = idy; + int oz = (idz >> 2) * fwh + fwh - 1 - idx; + int K_pitch = K >> 2; + ox = ox + (oz % K_pitch) * 4; + oz = oz / K_pitch; + + int out_off = (oz * bc + oy) * K + ox; + vstore4(val, out_off, flt); +} diff --git a/compute/tensor/src/gpu/mali/cl/gemm_tn.cl b/compute/tensor/src/gpu/mali/cl/gemm_tn.cl index cdf2e23c..95f1a6fd 100644 --- a/compute/tensor/src/gpu/mali/cl/gemm_tn.cl +++ b/compute/tensor/src/gpu/mali/cl/gemm_tn.cl @@ -74,8 +74,7 @@ #if defined(USE_OUTPUT_IMG) #define ADD_C_OFF(off) \ { \ - \ - off.z += 1; \ + off.z += 1; \ } #else #define ADD_C_OFF(off) \ diff --git a/compute/tensor/src/gpu/mali/cl/kernel_def.h b/compute/tensor/src/gpu/mali/cl/kernel_def.h index edf721c3..81a451ce 100644 --- a/compute/tensor/src/gpu/mali/cl/kernel_def.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_def.h @@ -245,36 +245,36 @@ __constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | #define LOAD_BUF_ARRAY5(v, off, buf) \ { \ T4 tmp = vload4(0, buf + off); \ - v[0] = tmp.s0; \ - v[1] = tmp.s1; \ - v[2] = tmp.s2; \ - v[3] = tmp.s3; \ + v[0] = tmp.x; \ + v[1] = tmp.y; \ + v[2] = tmp.z; \ + v[3] = tmp.w; \ v[4] = buf[off + 4]; \ } #define LOAD_BUF_ARRAY6(v, off, buf) \ - { \ + { \ T4 tmp = vload4(0, buf + off); \ + v[0] = tmp.x; \ + v[1] = tmp.y; \ + v[2] = tmp.z; \ + v[3] = tmp.w; \ T2 tmpex = vload2(0, buf + off + 4); \ - v[0] = tmp.s0; \ - v[1] = tmp.s1; \ - v[2] = tmp.s2; \ - v[3] = tmp.s3; \ - v[4] = tmpex.s0; \ - v[5] = tmpex.s1; \ + v[4] = tmpex.x; \ + v[5] = tmpex.y; \ } #define LOAD_BUF_ARRAY7(v, off, buf) \ { \ T4 tmp = vload4(0, buf + off); \ + v[0] = tmp.x; \ + v[1] = tmp.y; \ + v[2] = tmp.z; \ + v[3] = tmp.w; \ T3 tmpex = vload3(0, buf + off + 4); \ - v[0] = tmp.s0; \ - v[1] = 
tmp.s1; \ - v[2] = tmp.s2; \ - v[3] = tmp.s3; \ - v[4] = tmpex.s0; \ - v[5] = tmpex.s1; \ - v[6] = tmpex.s2; \ + v[4] = tmpex.x; \ + v[5] = tmpex.y; \ + v[6] = tmpex.z; \ } #define LOAD_BUF_ARRAY8(v, off, buf) \ @@ -1341,6 +1341,14 @@ __constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | v.s2 = 1.0 / (1.0 + exp(-1.0 * v.s2)); \ v.s3 = 1.0 / (1.0 + exp(-1.0 * v.s3)); \ } +#elif defined(USE_SWISH) +#define ACTIVATION_V4(v) \ + { \ + v.s0 = v.s0 / (1.0 + exp(-1.0 * v.s0)); \ + v.s1 = v.s1 / (1.0 + exp(-1.0 * v.s1)); \ + v.s2 = v.s2 / (1.0 + exp(-1.0 * v.s2)); \ + v.s3 = v.s3 / (1.0 + exp(-1.0 * v.s3)); \ + } #elif defined(USE_ABS) #define ACTIVATION_V4(v) \ { \ @@ -1365,6 +1373,14 @@ __constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | v.s2 = -v.s2; \ v.s3 = -v.s3; \ } +#elif defined(USE_EXP) +#define ACTIVATION_V4(v) \ + { \ + v.s0 = exp(v.s0); \ + v.s1 = exp(v.s1); \ + v.s2 = exp(v.s2); \ + v.s3 = exp(v.s3); \ + } #else #define ACTIVATION_V1(v) \ {} diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/activation_opt.h b/compute/tensor/src/gpu/mali/cl/kernel_option/activation_opt.h index d4dd65c7..78067099 100644 --- a/compute/tensor/src/gpu/mali/cl/kernel_option/activation_opt.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/activation_opt.h @@ -17,8 +17,9 @@ inline EE set_activation_opt_mali(bool useNchwFormat, CHECK_STATUS(set_io_mem_name(inputMemType, outputMemType, ioMemName)); char modeName[128] = ""; CHECK_STATUS(set_activation_mode_name(activeMode, modeName)); - sprintf(kernelName, "activation_%s%s%s", ioMemName, formatName.c_str(), modeName); - sprintf(kernelOpt->sourceName, "activation"); + std::string kernel = std::string("activation_") + ioMemName + formatName + modeName; + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "activation"); kernelOpt->kernelDataType = dt; char *opt = kernelOpt->option; CHECK_STATUS(set_activation_define_opt(activeMode, opt)); diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/cast_opt.h b/compute/tensor/src/gpu/mali/cl/kernel_option/cast_opt.h index 5422fcd5..bb7914d2 100644 --- a/compute/tensor/src/gpu/mali/cl/kernel_option/cast_opt.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/cast_opt.h @@ -36,8 +36,9 @@ inline EE set_cast_opt_mali(bool useNchwFormat, } else { CHECK_STATUS(NOT_SUPPORTED); } - sprintf(kernelName, "cast_%s%s_to_%s", formatName.c_str(), idtName.c_str(), odtName.c_str()); - sprintf(kernelOpt->sourceName, "cast"); + std::string kernel = std::string("cast_") + formatName + idtName + std::string("_to_") + odtName; + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "cast"); if (useNchwFormat) { CHECK_STATUS(set_chars_define_opt("USE_NCHW", opt)); } diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/channel_resize_opt.h b/compute/tensor/src/gpu/mali/cl/kernel_option/channel_resize_opt.h index a5e8ad24..188c7dfa 100644 --- a/compute/tensor/src/gpu/mali/cl/kernel_option/channel_resize_opt.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/channel_resize_opt.h @@ -15,8 +15,9 @@ inline EE set_channel_resize_opt_mali(bool useNchwFormat, char ioMemName[128] = ""; CHECK_STATUS(set_io_mem_name(inputMemType, outputMemType, ioMemName)); - sprintf(kernelName, "channel_resize_%s%s", ioMemName, formatName.c_str()); - sprintf(kernelOpt->sourceName, "channel_resize"); + std::string kernel = std::string("channel_resize_") + ioMemName + formatName; + UNI_STRCPY(kernelName, kernel.c_str()); + 
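The new ACTIVATION_V4 variants above add Swish and Exp: Swish is x·sigmoid(x) = x / (1 + e^{-x}), and Exp is an element-wise exponential; the OpenCL macros apply the same formula to each of the four vector lanes. A scalar reference for the formulas:

```cpp
#include <cmath>

// Scalar reference for the USE_SWISH and USE_EXP activation macros.
static inline float swish(float x)
{
    return x / (1.0f + std::exp(-x));  // equivalently x * sigmoid(x)
}

static inline float activation_exp(float x)
{
    return std::exp(x);
}
```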
UNI_STRCPY(kernelOpt->sourceName, "channel_resize"); kernelOpt->kernelDataType = dt; char *opt = kernelOpt->option; if (useNchwFormat) { diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/clip_opt.h b/compute/tensor/src/gpu/mali/cl/kernel_option/clip_opt.h index e2efdb24..0866f53a 100644 --- a/compute/tensor/src/gpu/mali/cl/kernel_option/clip_opt.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/clip_opt.h @@ -10,14 +10,14 @@ inline EE set_clip_opt_mali(bool useNchwFormat, { char *opt = kernelOpt->option; kernelOpt->kernelDataType = dt; - std::string formatName = ""; - if (useNchwFormat) { - formatName = "nchw_"; - } char ioMemName[128] = ""; CHECK_STATUS(set_io_mem_name(inputMemType, outputMemType, ioMemName)); - sprintf(kernelName, "clip_%s%s", ioMemName, formatName.c_str()); - sprintf(kernelOpt->sourceName, "clip"); + std::string name = "clip_" + std::string(ioMemName); + if (useNchwFormat) { + name += "nchw_"; + } + UNI_STRCPY(kernelName, name.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "clip"); if (useNchwFormat) { CHECK_STATUS(set_chars_define_opt("USE_NCHW", opt)); } diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/common_opt.h b/compute/tensor/src/gpu/mali/cl/kernel_option/common_opt.h index cdf928c4..130d201a 100644 --- a/compute/tensor/src/gpu/mali/cl/kernel_option/common_opt.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/common_opt.h @@ -9,7 +9,7 @@ inline EE set_chars_define_opt(const char *optName, char *&opt) std::string sopt = "-D"; sopt += optName; sopt += " "; - strcpy(opt, sopt.c_str()); + UNI_STRCPY(opt, sopt.c_str()); opt += sopt.length(); return SUCCESS; } @@ -22,7 +22,7 @@ inline EE set_value_define_opt(U32 val, const char *valName, char *&opt) sopt += "="; sopt += sval; sopt += " "; - strcpy(opt, sopt.c_str()); + UNI_STRCPY(opt, sopt.c_str()); opt += sopt.length(); return SUCCESS; } @@ -64,11 +64,17 @@ inline EE set_activation_define_opt(ActivationMode activeMode, char *&opt) case ACTIVATION_NEG: sopt = "-DUSE_NEG -D AM=neg_ "; break; + case ACTIVATION_EXP: + sopt = "-DUSE_EXP -D AM=exp_ "; + break; + case ACTIVATION_SWISH: + sopt = "-DUSE_SWISH -D AM=swish_ "; + break; default: CHECK_STATUS(NOT_SUPPORTED); break; } - strcpy(opt, sopt.c_str()); + UNI_STRCPY(opt, sopt.c_str()); opt += sopt.length(); return SUCCESS; } @@ -109,11 +115,17 @@ inline EE set_activation_mode_name(ActivationMode activeMode, char *name) case ACTIVATION_NEG: sname = "neg_"; break; + case ACTIVATION_EXP: + sname = "exp_"; + break; + case ACTIVATION_SWISH: + sname = "swish_"; + break; default: CHECK_STATUS(NOT_SUPPORTED); break; } - strcpy(name, sname.c_str()); + UNI_STRCPY(name, sname.c_str()); return SUCCESS; } @@ -143,7 +155,7 @@ inline EE set_eltwise_define_opt(EltwiseMode eltwiseMode, char *&opt) CHECK_STATUS(NOT_SUPPORTED); break; } - strcpy(opt, sopt.c_str()); + UNI_STRCPY(opt, sopt.c_str()); opt += sopt.length(); return SUCCESS; } @@ -174,7 +186,7 @@ inline EE set_eltwise_mode_name(EltwiseMode eltwiseMode, char *name) CHECK_STATUS(NOT_SUPPORTED); break; } - strcpy(name, sname.c_str()); + UNI_STRCPY(name, sname.c_str()); return SUCCESS; } @@ -198,7 +210,7 @@ inline EE set_io_mem_define_opt(GCLMemType inputType, GCLMemType outputType, cha } else { def += "-D IOM= "; } - strcpy(opt, def.c_str()); + UNI_STRCPY(opt, def.c_str()); opt += def.length(); return SUCCESS; } @@ -215,7 +227,7 @@ inline EE set_io_mem_name(GCLMemType inputType, GCLMemType outputType, char *nam } else if (useInputImg && useOutputImg) { sname = "iom_"; } - strcpy(name, sname.c_str()); + 
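The helpers above assemble the OpenCL build options by appending "-DNAME " or "-DNAME=value " tokens to a growing option buffer and advancing the write pointer by the written length. A minimal sketch of the same idea using std::string instead of the raw char buffer; the exact spacing of the real defines is taken from the hunks above and the helper name here is hypothetical:

```cpp
#include <string>

// Append a build define of the form "-D NAME=value " to an option string,
// mirroring what set_value_define_opt does with a char buffer and pointer bump.
void append_define(std::string &options, const std::string &name, unsigned value)
{
    options += "-D " + name + "=" + std::to_string(value) + " ";
}
// e.g. append_define(opts, "AXIS", 2) leaves "... -D AXIS=2 " in opts.
```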
UNI_STRCPY(name, sname.c_str()); return SUCCESS; } @@ -273,7 +285,7 @@ inline EE set_io_mems_name_and_define_opts(GCLMemType *inputMemType, } CHECK_STATUS(set_chars_define_opt(iomDef.c_str(), opt)); - strcpy(name, iom.c_str()); + UNI_STRCPY(name, iom.c_str()); return SUCCESS; } @@ -291,7 +303,7 @@ inline EE set_data_type_name(DataType dt, char *name) } else { return NOT_SUPPORTED; } - strcpy(name, sname.c_str()); + UNI_STRCPY(name, sname.c_str()); return SUCCESS; } @@ -309,7 +321,7 @@ inline EE set_data_type_define_opt(DataType dt, char *&opt) } else { return NOT_SUPPORTED; } - strcpy(opt, sopt.c_str()); + UNI_STRCPY(opt, sopt.c_str()); opt += sopt.length(); return SUCCESS; } @@ -323,8 +335,9 @@ inline EE set_common_opt(DataType dt, { char ioMemName[128] = ""; CHECK_STATUS(set_io_mem_name(inputMemType, outputMemType, ioMemName)); - sprintf(kernelName, "%s_%s", sourceName, ioMemName); - strcpy(kernelOpt->sourceName, sourceName); + std::string kernel = sourceName + std::string("_") + ioMemName; + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, sourceName); kernelOpt->kernelDataType = dt; char *opt = kernelOpt->option; CHECK_STATUS(set_io_mem_define_opt(inputMemType, outputMemType, opt)); @@ -339,7 +352,7 @@ inline bool check_qualcomm_device(char *devName = nullptr) if (useQualcommDev) { dev = "_qc"; } - strcpy(devName, dev.c_str()); + UNI_STRCPY(devName, dev.c_str()); } return useQualcommDev; } @@ -347,7 +360,7 @@ inline bool check_qualcomm_device(char *devName = nullptr) inline EE add_qcom_acc_16_bit_opt(char *&opt) { std::string qcom_acc = "-qcom-accelerate-16-bit "; - strcpy(opt, qcom_acc.c_str()); + UNI_STRCPY(opt, qcom_acc.c_str()); opt += qcom_acc.length(); return SUCCESS; } diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/concat_opt.h b/compute/tensor/src/gpu/mali/cl/kernel_option/concat_opt.h index bc6e723d..95ebf549 100644 --- a/compute/tensor/src/gpu/mali/cl/kernel_option/concat_opt.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/concat_opt.h @@ -46,11 +46,12 @@ inline EE set_concat_opt_mali(U32 concatDim, char iomName[128] = ""; CHECK_STATUS( set_io_mems_name_and_define_opts(inputMemType, &outputMemType, inputNum, 1, iomName, opt)); - sprintf(kernelName, "concat_%s%s%s%d", formatName.c_str(), iomName, dimName.c_str(), inputNum); + std::string kernel = "concat_" + formatName + iomName + dimName + std::to_string(inputNum); + UNI_STRCPY(kernelName, kernel.c_str()); if (useNchwFormat) { - sprintf(kernelOpt->sourceName, "concat_nchw"); + UNI_STRCPY(kernelOpt->sourceName, "concat_nchw"); } else { - sprintf(kernelOpt->sourceName, "concat"); + UNI_STRCPY(kernelOpt->sourceName, "concat"); } kernelOpt->kernelDataType = dt; CHECK_STATUS(set_value_define_opt(inputNum, "N", opt)); diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/conv_depthwise_opt.h b/compute/tensor/src/gpu/mali/cl/kernel_option/conv_depthwise_opt.h index 44137315..2db31b5a 100644 --- a/compute/tensor/src/gpu/mali/cl/kernel_option/conv_depthwise_opt.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/conv_depthwise_opt.h @@ -12,8 +12,10 @@ inline EE set_conv_depthwise_trans_flt(U32 workFiltersPerThread, kernelOpt->kernelDataType = dt; char *opt = kernelOpt->option; CHECK_STATUS(set_io_mem_name(GCL_MEM_BUF, outputMemType, ioMemName)); - sprintf(kernelName, "conv_depthwise_trans_fltbuf_%s%d", ioMemName, item_k); - sprintf(kernelOpt->sourceName, "conv_depthwise_trans_fltbuf"); + std::string kernel = + std::string("conv_depthwise_trans_fltbuf_") + ioMemName + 
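Throughout these kernel_option headers the sprintf calls are replaced by std::string concatenation followed by a copy into the caller-provided name buffer. A hedged sketch of that pattern; the 128-byte capacity and the helper name are assumptions for illustration, not the library's API:

```cpp
#include <cstring>
#include <string>

// Compose a kernel name from its parts and copy it into a fixed caller buffer,
// as the refactored set_*_opt_mali helpers do via std::string + UNI_STRCPY.
void set_kernel_name(char *kernelName, const std::string &base,
    const std::string &ioMemName, const std::string &suffix, int inputNum)
{
    const std::string name = base + ioMemName + suffix + std::to_string(inputNum);
    strncpy(kernelName, name.c_str(), 127);
    kernelName[127] = '\0';
}
```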
std::to_string(item_k); + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "conv_depthwise_trans_fltbuf"); CHECK_STATUS(set_value_define_opt(item_k, "K", opt)); CHECK_STATUS(set_io_mem_define_opt(GCL_MEM_BUF, outputMemType, opt)); return SUCCESS; @@ -54,10 +56,11 @@ inline EE set_conv_depthwise_opt_mali(U32 fw, if (outputNchwMode) { formatName = "nchw_"; } - sprintf(kernelName, "conv_depthwise_sh%d%s_%s%s%s%d%d%d", sh, devName, ioMemName, modeName, - formatName.c_str(), fw, fh, ON); - - sprintf(kernelOpt->sourceName, "conv_depthwise_sh%d%s", sh, devName); + std::string source = std::string("conv_depthwise_sh") + std::to_string(sh) + devName; + std::string kernel = source + std::string("_") + ioMemName + modeName + formatName + + std::to_string(fw) + std::to_string(fh) + std::to_string(ON); + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, source.c_str()); kernelOpt->kernelDataType = dt; char *opt = kernelOpt->option; if (ON < 1 || ON > 8) { @@ -152,10 +155,13 @@ inline EE set_conv_depthwise_dila_opt_mali(U32 fw, if (outputNchwMode) { formatName = "nchw_"; } - sprintf(kernelName, "conv_depthwise_sh%d_%s%s%s%s%d%d%d", sh, dilaMode.c_str(), ioMemName, - modeName, formatName.c_str(), fw, fh, ON); - - sprintf(kernelOpt->sourceName, "conv_depthwise_sh%d_dila", sh); + std::string kernel = std::string("conv_depthwise_sh") + std::to_string(sh) + std::string("_") + + dilaMode + ioMemName + modeName + formatName + std::to_string(fw) + std::to_string(fh) + + std::to_string(ON); + UNI_STRCPY(kernelName, kernel.c_str()); + std::string source = + std::string("conv_depthwise_sh") + std::to_string(sh) + std::string("_dila"); + UNI_STRCPY(kernelOpt->sourceName, source.c_str()); kernelOpt->kernelDataType = dt; char *opt = kernelOpt->option; if (ON < 1 || ON > 8) { diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/conv_direct_opt.h b/compute/tensor/src/gpu/mali/cl/kernel_option/conv_direct_opt.h index 965a1b4c..60e901f0 100644 --- a/compute/tensor/src/gpu/mali/cl/kernel_option/conv_direct_opt.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/conv_direct_opt.h @@ -21,9 +21,10 @@ inline EE set_conv_direct_trans_flt(U32 workChannelsPerThread, transWHName = "hw_"; CHECK_STATUS(set_chars_define_opt("USE_TRANS_WH", opt)); } - sprintf(kernelName, "conv_direct_trans_flt_%s%s%d%d", ioMemName, transWHName.c_str(), item_c, - item_k); - sprintf(kernelOpt->sourceName, "conv_direct_trans_flt"); + std::string kernel = std::string("conv_direct_trans_flt_") + ioMemName + transWHName + + std::to_string(item_c) + std::to_string(item_k); + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "conv_direct_trans_flt"); CHECK_STATUS(set_value_define_opt(item_c, "C", opt)); CHECK_STATUS(set_value_define_opt(item_k, "K", opt)); CHECK_STATUS(set_io_mem_define_opt(GCL_MEM_BUF, outputMemType, opt)); @@ -81,15 +82,18 @@ inline EE set_conv_direct_opt_mali(U32 fw, biasName = "nobias_"; } + std::string kernel, source; if (ft > 1) { - sprintf(kernelName, "conv_direct_3d_sh%d%s_%s%s%s%d%d%d%d%d", sh, devName, ioMemName, - modeName, biasName.c_str(), fw, fh, ft, ON, KN); - sprintf(kernelOpt->sourceName, "conv_direct_3d_sh%d%s", sh, devName); + source = std::string("conv_direct_3d_sh") + std::to_string(sh) + std::string(devName); + kernel = source + std::string("_") + ioMemName + modeName + biasName + std::to_string(fw) + + std::to_string(fh) + std::to_string(ft) + std::to_string(ON) + std::to_string(KN); } else { - sprintf(kernelName, 
"conv_direct_sh%d%s_%s%s%s%d%d%d%d", sh, devName, ioMemName, modeName, - biasName.c_str(), fw, fh, ON, KN); - sprintf(kernelOpt->sourceName, "conv_direct_sh%d%s", sh, devName); + source = std::string("conv_direct_sh") + std::to_string(sh) + std::string(devName); + kernel = source + std::string("_") + ioMemName + modeName + biasName + std::to_string(fw) + + std::to_string(fh) + std::to_string(ON) + std::to_string(KN); } + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, source.c_str()); kernelOpt->kernelDataType = dt; char *opt = kernelOpt->option; @@ -192,9 +196,11 @@ inline EE set_conv_direct_multi_batch_opt_mali(U32 fw, CHECK_STATUS(set_activation_mode_name(activeMode, modeName)); char ioMemName[128] = ""; CHECK_STATUS(set_io_mem_name(inputMemType, outputMemType, ioMemName)); - sprintf(kernelName, "conv_direct_multi_batch_sh%d_%s%s%d%d%d%d%d", sh, ioMemName, modeName, fw, - fh, ON, KN, BN); - sprintf(kernelOpt->sourceName, "conv_direct_multi_batch_sh%d", sh); + std::string source = std::string("conv_direct_multi_batch_sh") + std::to_string(sh); + std::string kernel = source + std::string("_") + ioMemName + modeName + std::to_string(fw) + + std::to_string(fh) + std::to_string(ON) + std::to_string(KN) + std::to_string(BN); + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, source.c_str()); kernelOpt->kernelDataType = dt; char *opt = kernelOpt->option; if (ON < 1 || ON > 8) { @@ -284,8 +290,10 @@ inline EE set_conv_direct_reuse_w_opt_mali(U32 fw, CHECK_STATUS(set_activation_mode_name(activeMode, modeName)); char ioMemName[128] = ""; CHECK_STATUS(set_io_mem_name(inputMemType, outputMemType, ioMemName)); - sprintf(kernelName, "conv_direct_sw1_reuse_w_%s%s%d%d%d%d", ioMemName, modeName, fw, fh, ON, KN); - sprintf(kernelOpt->sourceName, "conv_direct_sw1_reuse_w"); + std::string kernel = std::string("conv_direct_sw1_reuse_w_") + ioMemName + modeName + + std::to_string(fw) + std::to_string(fh) + std::to_string(ON) + std::to_string(KN); + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "conv_direct_sw1_reuse_w"); kernelOpt->kernelDataType = dt; char *opt = kernelOpt->option; if (ON < 1 || ON > 8) { @@ -353,15 +361,20 @@ inline EE set_conv_direct_nchw_to_nchwc4_opt_mali(U32 fw, CHECK_STATUS(set_activation_mode_name(activeMode, modeName)); char ioMemName[128] = ""; CHECK_STATUS(set_io_mem_name(inputMemType, outputMemType, ioMemName)); + std::string kernel, source; if (ft > 1) { - sprintf(kernelName, "conv_direct_3d_sw%d_nchw_to_nchwc4%s_%s%s%d%d%d%d", sw, devName, - ioMemName, modeName, fw, fh, ft, ON); - sprintf(kernelOpt->sourceName, "conv_direct_3d_sw%d_nchw_to_nchwc4%s", sw, devName); + source = std::string("conv_direct_3d_sw") + std::to_string(sw) + + std::string("_nchw_to_nchwc4") + devName; + kernel = source + std::string("_") + ioMemName + modeName + std::to_string(fw) + + std::to_string(fh) + std::to_string(ft) + std::to_string(ON); } else { - sprintf(kernelName, "conv_direct_sw%d_nchw_to_nchwc4%s_%s%s%d%d%d", sw, devName, ioMemName, - modeName, fw, fh, ON); - sprintf(kernelOpt->sourceName, "conv_direct_sw%d_nchw_to_nchwc4%s", sw, devName); + source = std::string("conv_direct_sw") + std::to_string(sw) + + std::string("_nchw_to_nchwc4") + devName; + kernel = source + std::string("_") + ioMemName + modeName + std::to_string(fw) + + std::to_string(fh) + std::to_string(ON); } + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, source.c_str()); kernelOpt->kernelDataType = dt; char *opt = 
kernelOpt->option; @@ -441,9 +454,13 @@ inline EE set_conv_direct_dila_opt_mali(U32 fw, if (dh == 2) { dilaMode = "dila2_"; } - sprintf(kernelName, "conv_direct_sh%d%s_%s%s%s%d%d%d%d", sh, devName, dilaMode.c_str(), - ioMemName, modeName, fw, fh, ON, KN); - sprintf(kernelOpt->sourceName, "conv_direct_sh%d%s_dila", sh, devName); + std::string kernel = std::string("conv_direct_sh") + std::to_string(sh) + devName + + std::string("_") + dilaMode + ioMemName + modeName + std::to_string(fw) + + std::to_string(fh) + std::to_string(ON) + std::to_string(KN); + std::string source = + std::string("conv_direct_sh") + std::to_string(sh) + devName + std::string("_dila"); + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, source.c_str()); kernelOpt->kernelDataType = dt; char *opt = kernelOpt->option; if (ON < 1 || ON > 8) { @@ -517,9 +534,10 @@ inline EE set_conv_direct_sh1_fn_spe_opt_mali(U32 fw, if (useNchwFormat) { formatName = "nchw_"; } - sprintf(kernelName, "conv_direct_sh1_fn_spe_%s%s%s%d%d%d", ioMemName, modeName, - formatName.c_str(), fw, fh, ON); - sprintf(kernelOpt->sourceName, "conv_direct_sh1_fn_spe"); + std::string buffer = std::string("conv_direct_sh1_fn_spe_") + ioMemName + modeName + + formatName + std::to_string(fw) + std::to_string(fh) + std::to_string(ON); + UNI_STRCPY(kernelName, buffer.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "conv_direct_sh1_fn_spe"); kernelOpt->kernelDataType = dt; char *opt = kernelOpt->option; diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/conv_invgemm_opt.h b/compute/tensor/src/gpu/mali/cl/kernel_option/conv_invgemm_opt.h new file mode 100644 index 00000000..13973247 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/conv_invgemm_opt.h @@ -0,0 +1,37 @@ +#ifndef CONV_INVGEMM_OPT +#define CONV_INVGEMM_OPT +#include "common_opt.h" +inline EE set_conv_invgemm_trans_flt_opt( + U32 workFiltersPerThread, DataType dt, char *kernelName, KernelOpt *kernelOpt) +{ + kernelOpt->kernelDataType = dt; + char *opt = kernelOpt->option; + U32 item_k = workFiltersPerThread; + std::string kernel = std::string("conv_invgemm_trans_flt_") + std::to_string(item_k); + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "conv_invgemm_trans_flt"); + CHECK_STATUS(set_value_define_opt(item_k, "K", opt)); + return SUCCESS; +} + +inline EE set_conv_invgemm_col2img_opt(ActivationMode activeMode, + DataType dt, + GCLMemType inputMemType, + GCLMemType outputMemType, + char *kernelName, + KernelOpt *kernelOpt) +{ + char *opt = kernelOpt->option; + kernelOpt->kernelDataType = dt; + char modeName[128]; + CHECK_STATUS(set_activation_mode_name(activeMode, modeName)); + char ioMemName[128] = ""; + CHECK_STATUS(set_io_mem_name(inputMemType, outputMemType, ioMemName)); + std::string kernel = std::string("conv_invgemm_col2img_") + ioMemName + modeName; + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "conv_invgemm_col2img"); + CHECK_STATUS(set_activation_define_opt(activeMode, opt)); + CHECK_STATUS(set_io_mem_define_opt(inputMemType, outputMemType, opt)); + return SUCCESS; +} +#endif diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/conv_wino_opt.h b/compute/tensor/src/gpu/mali/cl/kernel_option/conv_wino_opt.h index 9ece942c..17f7e988 100644 --- a/compute/tensor/src/gpu/mali/cl/kernel_option/conv_wino_opt.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/conv_wino_opt.h @@ -7,13 +7,14 @@ inline EE set_conv_wino_rotate_flt( kernelOpt->kernelDataType = dt; char *opt = kernelOpt->option; 
U32 fwh = fw * fh; - sprintf(kernelName, "conv_wino_rotate_fltbuf_%d", fwh); - sprintf(kernelOpt->sourceName, "conv_wino_rotate_fltbuf"); + std::string kernel = std::string("conv_wino_rotate_fltbuf_") + std::to_string(fwh); + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "conv_wino_rotate_fltbuf"); CHECK_STATUS(set_value_define_opt(fwh, "FWH", opt)); return SUCCESS; } -inline EE set_conv_wino_preprocess_input_opt(DataType dt, +inline EE set_conv_wino_preprocess_input_opt(DataType dt, bool useNchwFormat, GCLMemType inputMemType, GCLMemType outputMemType, @@ -24,12 +25,13 @@ inline EE set_conv_wino_preprocess_input_opt(DataType dt, char *opt = kernelOpt->option; kernelOpt->kernelDataType = dt; CHECK_STATUS(set_io_mem_name(inputMemType, outputMemType, ioMemName)); - std::string formatName= ""; + std::string formatName = ""; if (useNchwFormat) { formatName = "nchw"; } - sprintf(kernelName, "conv_wino_preprocess_input_%s%s", ioMemName, formatName.c_str()); - sprintf(kernelOpt->sourceName, "conv_wino_preprocess_input"); + std::string kernel = std::string("conv_wino_preprocess_input_") + ioMemName + formatName; + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "conv_wino_preprocess_input"); CHECK_STATUS(set_io_mem_define_opt(inputMemType, outputMemType, opt)); if (useNchwFormat) { CHECK_STATUS(set_chars_define_opt("USE_NCHW", opt)); @@ -55,8 +57,9 @@ inline EE set_conv_wino_trans_outbuf_opt(bool useAlign, if (useAlign) { alignName = "align"; } - sprintf(kernelName, "conv_wino_trans_outbuf_%s%s%s", ioMemName, modeName, alignName.c_str()); - sprintf(kernelOpt->sourceName, "conv_wino_trans_outbuf"); + std::string kernel = std::string("conv_wino_trans_outbuf_") + ioMemName + modeName + alignName; + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "conv_wino_trans_outbuf"); CHECK_STATUS(set_activation_define_opt(activeMode, opt)); if (useAlign) { CHECK_STATUS(set_chars_define_opt("USE_ALIGN", opt)); diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/copy_opt.h b/compute/tensor/src/gpu/mali/cl/kernel_option/copy_opt.h index d3463fcf..8707b599 100644 --- a/compute/tensor/src/gpu/mali/cl/kernel_option/copy_opt.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/copy_opt.h @@ -28,8 +28,9 @@ inline EE set_copy_opt_mali(bool useBlockIndex, DataType dt, char *kernelName, K CHECK_STATUS(NOT_SUPPORTED); } - sprintf(kernelName, "copy_%s%s", BINDName.c_str(), dtName.c_str()); - sprintf(kernelOpt->sourceName, "copy"); + std::string kernel = std::string("copy_") + BINDName + dtName; + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "copy"); kernelOpt->kernelDataType = dt; if (useBlockIndex) { CHECK_STATUS(set_chars_define_opt("USE_BLOCK_INDEX", opt)); diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/deconv_opt.h b/compute/tensor/src/gpu/mali/cl/kernel_option/deconv_opt.h index a60098aa..2f031a4f 100644 --- a/compute/tensor/src/gpu/mali/cl/kernel_option/deconv_opt.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/deconv_opt.h @@ -15,8 +15,10 @@ inline EE set_deconv_gemm_trans_fltbuf(U32 workChannelsPerThread, char *opt = kernelOpt->option; CHECK_STATUS(set_io_mem_name(GCL_MEM_BUF, outputMemType, ioMemName)); - sprintf(kernelName, "deconv_gemm_trans_fltbuf_%d%d", item_c, item_k); - sprintf(kernelOpt->sourceName, "deconv_gemm_trans_fltbuf"); + std::string kernel = + std::string("deconv_gemm_trans_fltbuf_") + std::to_string(item_c) + std::to_string(item_k); + UNI_STRCPY(kernelName, 
kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "deconv_gemm_trans_fltbuf"); CHECK_STATUS(set_value_define_opt(item_c, "C", opt)); CHECK_STATUS(set_value_define_opt(item_k, "K", opt)); CHECK_STATUS(set_io_mem_define_opt(GCL_MEM_BUF, outputMemType, opt)); @@ -131,9 +133,11 @@ inline EE set_deconv_gemm_f2s2_opt(U32 workChannelsPerThread, if (reuseOnW) { reuseOnWName = "w_"; } - sprintf(kernelName, "deconv_gemm_f2s2%s_%s%s%s%d%d", devName, reuseOnWName.c_str(), ioMemName, - modeName, ON, KN); - sprintf(kernelOpt->sourceName, "deconv_gemm_f2s2%s", devName); + std::string source = std::string("deconv_gemm_f2s2") + devName; + std::string kernel = source + std::string("_") + reuseOnWName + ioMemName + modeName + + std::to_string(ON) + std::to_string(KN); + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, source.c_str()); U32 IN = ON; U32 LN = ON; CHECK_STATUS(set_value_define_opt(ON, "ON", opt)); diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/depth2space_opt.h b/compute/tensor/src/gpu/mali/cl/kernel_option/depth2space_opt.h index c00eeae7..97bc43d8 100644 --- a/compute/tensor/src/gpu/mali/cl/kernel_option/depth2space_opt.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/depth2space_opt.h @@ -17,8 +17,9 @@ inline EE set_depth2space_nchwc4_2x2_opt(bool useOutputNchw, char ioMemName[128] = ""; CHECK_STATUS(set_io_mem_name(inputMemType, outputMemType, ioMemName)); kernelOpt->kernelDataType = DT_F16; - sprintf(kernelName, "depth2space_nchwc4_2x2_%s%s", ioMemName, outputFormatName.c_str()); - sprintf(kernelOpt->sourceName, "depth2space_nchwc4_2x2"); + std::string kernel = std::string("depth2space_nchwc4_2x2_") + ioMemName + outputFormatName; + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "depth2space_nchwc4_2x2"); if (useOutputNchw) { CHECK_STATUS(set_chars_define_opt("OUT_NCHW", opt)); } diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/eltwise_opt.h b/compute/tensor/src/gpu/mali/cl/kernel_option/eltwise_opt.h index 8b81788b..8bb8fae8 100644 --- a/compute/tensor/src/gpu/mali/cl/kernel_option/eltwise_opt.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/eltwise_opt.h @@ -24,9 +24,10 @@ inline EE set_eltwise_opt_mali(U32 inputNum, char iomName[128] = ""; CHECK_STATUS( set_io_mems_name_and_define_opts(inputMemType, &outputMemType, inputNum, 1, iomName, opt)); - sprintf( - kernelName, "eltwise_%s%s%s%s%d", iomName, actName, eltName, formatName.c_str(), inputNum); - sprintf(kernelOpt->sourceName, "eltwise"); + std::string kernel = std::string("eltwise_") + iomName + actName + eltName + formatName + + std::to_string(inputNum); + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "eltwise"); kernelOpt->kernelDataType = dt; CHECK_STATUS(set_value_define_opt(inputNum, "N", opt)); CHECK_STATUS(set_activation_define_opt(activeMode, opt)); @@ -74,9 +75,10 @@ inline EE set_eltwise_broadcast_opt_mali(bool useNchwFormat, char iomName[128] = ""; CHECK_STATUS(set_io_mems_name_and_define_opts(inputMemType, &outputMemType, 2, 1, iomName, opt)); - sprintf(kernelName, "eltwise_broadcast_%s%s%s%s%s%s", iomName, actName, eltName, - swapInputName.c_str(), formatName.c_str(), axisName.c_str()); - sprintf(kernelOpt->sourceName, "eltwise_broadcast"); + std::string kernel = std::string("eltwise_broadcast_") + iomName + actName + eltName + + swapInputName + formatName + axisName; + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "eltwise_broadcast"); 
CHECK_STATUS(set_activation_define_opt(activeMode, opt)); CHECK_STATUS(set_eltwise_define_opt(eltwiseMode, opt)); if (useNchwFormat) { diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/expand_opt.h b/compute/tensor/src/gpu/mali/cl/kernel_option/expand_opt.h index b674305c..ab0b8597 100644 --- a/compute/tensor/src/gpu/mali/cl/kernel_option/expand_opt.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/expand_opt.h @@ -11,9 +11,10 @@ inline EE set_expand_opt_mali(U32 nDims, char *opt = kernelOpt->option; char ioMemName[128] = ""; CHECK_STATUS(set_io_mem_name(inputMemType, outputMemType, ioMemName)); + std::string kernel = std::string("expand_") + ioMemName + std::to_string(nDims); + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "expand"); kernelOpt->kernelDataType = dt; - sprintf(kernelName, "expand_%s%d", ioMemName, nDims); - sprintf(kernelOpt->sourceName, "expand"); CHECK_STATUS(set_value_define_opt(nDims, "DN", opt)); CHECK_STATUS(set_io_mem_define_opt(inputMemType, outputMemType, opt)); return SUCCESS; diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/fill_memory_zero_vec4_opt.h b/compute/tensor/src/gpu/mali/cl/kernel_option/fill_memory_zero_vec4_opt.h index b7f25cd6..4f89255e 100644 --- a/compute/tensor/src/gpu/mali/cl/kernel_option/fill_memory_zero_vec4_opt.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/fill_memory_zero_vec4_opt.h @@ -10,8 +10,10 @@ inline EE set_fill_memory_zero_vec4_opt_mali( kernelOpt->kernelDataType = dt; char dtName[128]; CHECK_STATUS(set_data_type_name(dt, dtName)); - sprintf(kernelName, "fill_memory_zero_vec4_%s%s", ioMemName, dtName); - sprintf(kernelOpt->sourceName, "fill_memory_zero_vec4"); + std::string buffer = + std::string("fill_memory_zero_vec4_") + std::string(ioMemName) + std::string(dtName); + UNI_STRCPY(kernelName, buffer.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "fill_memory_zero_vec4"); CHECK_STATUS(set_data_type_define_opt(dt, opt)); CHECK_STATUS(set_io_mem_define_opt(GCL_MEM_BUF, outputMemType, opt)); return SUCCESS; diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/gemm_tn_opt.h b/compute/tensor/src/gpu/mali/cl/kernel_option/gemm_tn_opt.h index 8aeec7fa..409d2755 100644 --- a/compute/tensor/src/gpu/mali/cl/kernel_option/gemm_tn_opt.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/gemm_tn_opt.h @@ -113,9 +113,11 @@ inline EE set_gemm_tn_opt_mali(U32 item_m, matCMemName = "cm_"; CHECK_STATUS(set_chars_define_opt("USE_OUTPUT_IMG", opt)); } - sprintf(kernelName, "gemm_tn%s_%s%s%s%s%s%s%d%d", devName, matAMemName.c_str(), matBMemName.c_str(), - matCMemName.c_str(), modeName, formatName.c_str(), biasName.c_str(), item_m, item_n); - sprintf(kernelOpt->sourceName, "gemm_tn%s", devName); + std::string source = std::string("gemm_tn") + devName; + std::string kernel = source + std::string("_") + matAMemName + matBMemName + matCMemName + + modeName + formatName + biasName + std::to_string(item_m) + std::to_string(item_n); + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, source.c_str()); kernelOpt->kernelDataType = dt; U32 UN = item_n - 1; CHECK_STATUS(set_value_define_opt(item_m, "LM", opt)); diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/gemv_opt.h b/compute/tensor/src/gpu/mali/cl/kernel_option/gemv_opt.h index ffac49c2..6f479ac8 100644 --- a/compute/tensor/src/gpu/mali/cl/kernel_option/gemv_opt.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/gemv_opt.h @@ -15,8 +15,9 @@ inline EE set_gemv_trans_mat_opt(U32 workMatChannelsPerThread, 
CHECK_STATUS(set_chars_define_opt("USE_TRANS_CK", opt)); transName = "kc_"; } - sprintf(kernelName, "gemv_trans_mat_%s%d", transName.c_str(), C); - sprintf(kernelOpt->sourceName, "gemv_trans_mat"); + std::string kernel = std::string("gemv_trans_mat_") + transName + std::to_string(C); + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "gemv_trans_mat"); kernelOpt->kernelDataType = dt; CHECK_STATUS(set_value_define_opt(C, "C", opt)); return SUCCESS; @@ -64,9 +65,11 @@ inline EE set_gemv_opt(U32 workMatChannelsPerThread, reduceName = "_reduce"; } - sprintf(kernelName, "gemv%s_%s%s%s%d", reduceName.c_str(), modeName, outFormatName.c_str(), - biasName.c_str(), OC); - sprintf(kernelOpt->sourceName, "gemv%s", reduceName.c_str()); + std::string source = "gemv" + reduceName; + std::string kernel = + source + std::string("_") + modeName + outFormatName + biasName + std::to_string(OC); + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, source.c_str()); kernelOpt->kernelDataType = dt; CHECK_STATUS(set_value_define_opt(OC, "OC", opt)); CHECK_STATUS(set_activation_define_opt(activeMode, opt)); diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/mem_trans_opt.h b/compute/tensor/src/gpu/mali/cl/kernel_option/mem_trans_opt.h index fb7b83f0..40559f4d 100644 --- a/compute/tensor/src/gpu/mali/cl/kernel_option/mem_trans_opt.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/mem_trans_opt.h @@ -46,14 +46,15 @@ inline EE set_mem_trans_opt_mali(MemTransFormType type, default: CHECK_STATUS(NOT_MATCH); } - sprintf(kernelName, "mem_trans_%s%s%s%s", use3dFormat.c_str(), ioMemName, inputFormat.c_str(), - outputFormat.c_str()); - kernelOpt->kernelDataType = dt; + std::string kernel = + std::string("mem_trans_") + use3dFormat + ioMemName + inputFormat + outputFormat; + UNI_STRCPY(kernelName, kernel.c_str()); if (use3dMode) { - sprintf(kernelOpt->sourceName, "mem_trans_3d"); + UNI_STRCPY(kernelOpt->sourceName, "mem_trans_3d"); } else { - sprintf(kernelOpt->sourceName, "mem_trans"); + UNI_STRCPY(kernelOpt->sourceName, "mem_trans"); } + kernelOpt->kernelDataType = dt; CHECK_STATUS(set_io_mem_define_opt(inputMemType, outputMemType, opt)); return SUCCESS; } @@ -83,8 +84,9 @@ inline EE set_mem_trans_c_opt_mali(MemTransCType type, } else { CHECK_STATUS(NOT_MATCH); } - sprintf(kernelName, "mem_trans_c_%s%s", ioMemName, transFormat.c_str()); - sprintf(kernelOpt->sourceName, "mem_trans_c"); + std::string kernel = std::string("mem_trans_c_") + ioMemName + transFormat; + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "mem_trans_c"); kernelOpt->kernelDataType = dt; CHECK_STATUS(set_io_mem_define_opt(inputMemType, outputMemType, opt)); return SUCCESS; diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/normalization_opt.h b/compute/tensor/src/gpu/mali/cl/kernel_option/normalization_opt.h index 6eeec422..d93e8c8d 100644 --- a/compute/tensor/src/gpu/mali/cl/kernel_option/normalization_opt.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/normalization_opt.h @@ -5,13 +5,12 @@ inline EE set_normalization_opt_mali( bool useNchwFormat, DataType dt, char *kernelName, KernelOpt *kernelOpt) { - std::string formatName = ""; + std::string kernel = "normalization"; if (useNchwFormat) { - formatName = "_nchw"; + kernel += "_nchw"; } - - sprintf(kernelName, "normalization%s", formatName.c_str()); - sprintf(kernelOpt->sourceName, "normalization"); + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "normalization"); 
kernelOpt->kernelDataType = dt; char *opt = kernelOpt->option; if (useNchwFormat) { diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/padding_opt.h b/compute/tensor/src/gpu/mali/cl/kernel_option/padding_opt.h index 89e544d4..324505e7 100644 --- a/compute/tensor/src/gpu/mali/cl/kernel_option/padding_opt.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/padding_opt.h @@ -11,36 +11,36 @@ inline EE set_padding_opt_mali(bool useNchwFormat, { kernelOpt->kernelDataType = dt; char *opt = kernelOpt->option; - std::string formatName = ""; + std::string name = "padding_"; if (useNchwFormat) { - formatName = "nchw_"; + name += "nchw_"; } char ioMemName[128] = ""; CHECK_STATUS(set_io_mem_name(inputMemType, outputMemType, ioMemName)); - std::string modeName = ""; + std::string modeName; switch (mode) { - case Pad_Constant: + case PAD_CONSTANT: modeName = "constant"; CHECK_STATUS(set_chars_define_opt("USE_CONSTANT", opt)); break; - case Pad_Edge: + case PAD_EDGE: modeName = "edge"; CHECK_STATUS(set_chars_define_opt("USE_EDGE", opt)); break; - case Pad_Reflect: + case PAD_REFLECT: modeName = "reflect"; CHECK_STATUS(set_chars_define_opt("USE_REFLECT", opt)); break; - case Pad_Symmetric: + case PAD_SYMMETRIC: modeName = "symmetric"; CHECK_STATUS(set_chars_define_opt("USE_SYMMETRIC", opt)); break; default: return NOT_SUPPORTED; } - - sprintf(kernelName, "padding_%s%s", formatName.c_str(), modeName.c_str()); - sprintf(kernelOpt->sourceName, "padding"); + name += modeName; + UNI_STRCPY(kernelName, name.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "padding"); if (useNchwFormat) { CHECK_STATUS(set_chars_define_opt("USE_NCHW", opt)); } diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/pooling_opt.h b/compute/tensor/src/gpu/mali/cl/kernel_option/pooling_opt.h index 6333dc34..f85c79b4 100644 --- a/compute/tensor/src/gpu/mali/cl/kernel_option/pooling_opt.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/pooling_opt.h @@ -22,8 +22,9 @@ inline EE set_pooling_opt_mali(PoolingMode mode, } char ioMemName[128] = ""; CHECK_STATUS(set_io_mem_name(inputMemType, outputMemType, ioMemName)); - sprintf(kernelName, "pooling_%s%s", ioMemName, modeName.c_str()); - sprintf(kernelOpt->sourceName, "pooling"); + std::string kernel = std::string("pooling_") + ioMemName + modeName; + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "pooling"); kernelOpt->kernelDataType = dt; CHECK_STATUS(set_io_mem_define_opt(inputMemType, outputMemType, opt)); return SUCCESS; diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/power_opt.h b/compute/tensor/src/gpu/mali/cl/kernel_option/power_opt.h index 97c590c4..173da092 100644 --- a/compute/tensor/src/gpu/mali/cl/kernel_option/power_opt.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/power_opt.h @@ -20,8 +20,9 @@ inline EE set_power_opt_mali(bool useNchwFormat, } char ioMemName[128] = ""; CHECK_STATUS(set_io_mem_name(inputMemType, outputMemType, ioMemName)); - sprintf(kernelName, "power_%s%s%s", ioMemName, formatName.c_str(), dtName.c_str()); - sprintf(kernelOpt->sourceName, "power"); + std::string kernel = std::string("power_") + ioMemName + formatName + dtName; + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "power"); kernelOpt->kernelDataType = dt; char *opt = kernelOpt->option; if (useNchwFormat) { diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/prelu_opt.h b/compute/tensor/src/gpu/mali/cl/kernel_option/prelu_opt.h index 73730241..4c412457 100644 --- 
a/compute/tensor/src/gpu/mali/cl/kernel_option/prelu_opt.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/prelu_opt.h @@ -36,9 +36,9 @@ inline EE set_prelu_opt_mali(bool propagate_down, } char ioMemName[128] = ""; CHECK_STATUS(set_io_mem_name(inputMemType, outputMemType, ioMemName)); - sprintf(kernelName, "prelu_%s%s%s%s", ioMemName, formatName.c_str(), reluAxisName.c_str(), - progName.c_str()); - sprintf(kernelOpt->sourceName, "prelu"); + std::string kernel = std::string("prelu_") + ioMemName + formatName + reluAxisName + progName; + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "prelu"); kernelOpt->kernelDataType = dt; if (useNchwFormat) { CHECK_STATUS(set_chars_define_opt("USE_NCHW", opt)); diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/reduction_opt.h b/compute/tensor/src/gpu/mali/cl/kernel_option/reduction_opt.h index 8418c515..d31e91c6 100644 --- a/compute/tensor/src/gpu/mali/cl/kernel_option/reduction_opt.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/reduction_opt.h @@ -51,12 +51,13 @@ inline EE set_reduction_opt_mali(bool useNchwFormat, return NOT_SUPPORTED; } - sprintf(kernelName, "reduction_%s%s%s%d", formatName.c_str(), outputC4Name.c_str(), - modeName.c_str(), axis); + std::string kernel = + std::string("reduction_") + formatName + outputC4Name + modeName + std::to_string(axis); + UNI_STRCPY(kernelName, kernel.c_str()); if (useNchwFormat) { - sprintf(kernelOpt->sourceName, "reduction_nchw"); + UNI_STRCPY(kernelOpt->sourceName, "reduction_nchw"); } else { - sprintf(kernelOpt->sourceName, "reduction"); + UNI_STRCPY(kernelOpt->sourceName, "reduction"); } CHECK_STATUS(set_value_define_opt(axis, "AXIS", opt)); return SUCCESS; diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/rnncell_update_res_opt.h b/compute/tensor/src/gpu/mali/cl/kernel_option/rnncell_update_res_opt.h index 3408dc6a..dfd90ee7 100644 --- a/compute/tensor/src/gpu/mali/cl/kernel_option/rnncell_update_res_opt.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/rnncell_update_res_opt.h @@ -23,8 +23,9 @@ inline EE set_rnncell_update_res_opt_mali(bool useProjection, CHECK_STATUS(set_chars_define_opt("USE_RNN_MODE", opt)); } kernelOpt->kernelDataType = dt; - sprintf(kernelName, "rnncell_update_res_%s%s", proName.c_str(), modeName.c_str()); - sprintf(kernelOpt->sourceName, "rnncell_update_res"); + std::string kernel = std::string("rnncell_update_res_") + proName + modeName; + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "rnncell_update_res"); CHECK_STATUS(set_io_mem_define_opt(inputMemType, outputMemType, opt)); return SUCCESS; } diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/roialign_opt.h b/compute/tensor/src/gpu/mali/cl/kernel_option/roialign_opt.h index b2a0db6f..254bf29e 100644 --- a/compute/tensor/src/gpu/mali/cl/kernel_option/roialign_opt.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/roialign_opt.h @@ -27,8 +27,9 @@ inline EE set_roialign_opt_mali(bool useNchwFormat, } else { CHECK_STATUS(NOT_SUPPORTED); } - sprintf(kernelName, "roialign_%s%s%s", ioMemName, formatName.c_str(), modeName.c_str()); - sprintf(kernelOpt->sourceName, "roialign"); + std::string kernel = std::string("roialign_") + ioMemName + formatName + modeName; + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "roialign"); if (useNchwFormat) { CHECK_STATUS(set_chars_define_opt("USE_NCHW", opt)); } diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/scale_opt.h 
b/compute/tensor/src/gpu/mali/cl/kernel_option/scale_opt.h index 40d9466f..a78d264f 100644 --- a/compute/tensor/src/gpu/mali/cl/kernel_option/scale_opt.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/scale_opt.h @@ -48,9 +48,10 @@ inline EE set_scale_opt_mali(bool useAlpha, char ioMemName[128] = ""; CHECK_STATUS(set_io_mem_name(inputMemType, outputMemType, ioMemName)); - sprintf(kernelName, "scale_%s%s%s%s%s%s", ioMemName, formatName.c_str(), broadName.c_str(), - axisName.c_str(), alphaName.c_str(), betaName.c_str()); - sprintf(kernelOpt->sourceName, "scale"); + std::string kernel = std::string("scale_") + ioMemName + formatName + broadName + axisName + + alphaName + betaName; + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "scale"); CHECK_STATUS(set_io_mem_define_opt(inputMemType, outputMemType, opt)); return SUCCESS; } diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/slice_opt.h b/compute/tensor/src/gpu/mali/cl/kernel_option/slice_opt.h index 6a2bf5ca..6dc3a01e 100644 --- a/compute/tensor/src/gpu/mali/cl/kernel_option/slice_opt.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/slice_opt.h @@ -4,13 +4,13 @@ inline EE set_slice_opt_mali( bool useNchwFormat, U32 axis, U32 slice_num, DataType dt, char *kernelName, KernelOpt *kernelOpt) { - std::string formatName = ""; + std::string name = "slice_"; if (useNchwFormat) { - formatName = "nchw_"; + name += "nchw_"; } - - sprintf(kernelName, "slice_%s%d%d", formatName.c_str(), axis, slice_num); - sprintf(kernelOpt->sourceName, "slice"); + name += std::to_string(axis) + std::to_string(slice_num); + UNI_STRCPY(kernelName, name.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "slice"); kernelOpt->kernelDataType = dt; char *opt = kernelOpt->option; CHECK_STATUS(set_value_define_opt(axis, "AXIS_NUM", opt)); diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/softmax_opt.h b/compute/tensor/src/gpu/mali/cl/kernel_option/softmax_opt.h index 078a8018..62138184 100644 --- a/compute/tensor/src/gpu/mali/cl/kernel_option/softmax_opt.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/softmax_opt.h @@ -15,8 +15,9 @@ inline EE set_softmax_opt_mali(U32 axis, if (useNchwFormat) { formatName = "nchw_"; } - sprintf(kernelName, "softmax_%s%s%d", ioMemName, formatName.c_str(), axis); - sprintf(kernelOpt->sourceName, "softmax"); + std::string kernel = std::string("softmax_") + ioMemName + formatName + std::to_string(axis); + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "softmax"); kernelOpt->kernelDataType = dt; char *opt = kernelOpt->option; CHECK_STATUS(set_value_define_opt(axis, "AXIS", opt)); @@ -83,9 +84,10 @@ inline EE set_softmax_vec_reduce_opt_mali(bool useNchwFormat, } } } - sprintf(kernelName, "softmax_vec_reduce_%s%s%s%s", ioMemName, formatName.c_str(), - inputAxis.c_str(), outputAxis.c_str()); - sprintf(kernelOpt->sourceName, "softmax_vec_reduce"); + std::string kernel = + std::string("softmax_vec_reduce_") + ioMemName + formatName + inputAxis + outputAxis; + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "softmax_vec_reduce"); kernelOpt->kernelDataType = dt; if (useNchwFormat) { CHECK_STATUS(set_chars_define_opt("USE_NCHW", opt)); diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/space2depth_opt.h b/compute/tensor/src/gpu/mali/cl/kernel_option/space2depth_opt.h index e82e9df8..a7f38b7b 100644 --- a/compute/tensor/src/gpu/mali/cl/kernel_option/space2depth_opt.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/space2depth_opt.h @@ -10,15 
+10,15 @@ inline EE set_space2depth_opt(bool useFormatNchw, KernelOpt *kernelOpt) { char *opt = kernelOpt->option; - std::string formatName = ""; + std::string name = "space2depth"; if (useFormatNchw) { - formatName = "_nchw"; + name += "_nchw"; } char ioMemName[128] = ""; CHECK_STATUS(set_io_mem_name(inputMemType, outputMemType, ioMemName)); kernelOpt->kernelDataType = DT_F16; - sprintf(kernelName, "space2depth%s", formatName.c_str()); - sprintf(kernelOpt->sourceName, "space2depth"); + UNI_STRCPY(kernelName, name.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "space2depth"); if (useFormatNchw) { CHECK_STATUS(set_chars_define_opt("USE_NCHW", opt)); } diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/tile_opt.h b/compute/tensor/src/gpu/mali/cl/kernel_option/tile_opt.h index 2952260f..01c21ecd 100644 --- a/compute/tensor/src/gpu/mali/cl/kernel_option/tile_opt.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/tile_opt.h @@ -12,8 +12,9 @@ inline EE set_tile_opt_mali(U32 nDims, char ioMemName[128] = ""; CHECK_STATUS(set_io_mem_name(inputMemType, outputMemType, ioMemName)); kernelOpt->kernelDataType = dt; - sprintf(kernelName, "tile_%s%d", ioMemName, nDims); - sprintf(kernelOpt->sourceName, "tile"); + std::string kernel = std::string("tile_") + ioMemName + std::to_string(nDims); + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "tile"); CHECK_STATUS(set_value_define_opt(nDims, "DN", opt)); CHECK_STATUS(set_io_mem_define_opt(inputMemType, outputMemType, opt)); return SUCCESS; diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/transpose_opt.h b/compute/tensor/src/gpu/mali/cl/kernel_option/transpose_opt.h index 2f3d1745..5c63e340 100644 --- a/compute/tensor/src/gpu/mali/cl/kernel_option/transpose_opt.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/transpose_opt.h @@ -12,8 +12,9 @@ inline EE set_transpose_opt_mali(U32 nDims, char ioMemName[128] = ""; CHECK_STATUS(set_io_mem_name(inputMemType, outputMemType, ioMemName)); kernelOpt->kernelDataType = dt; - sprintf(kernelName, "transpose_nchw_%s%d", ioMemName, nDims); - sprintf(kernelOpt->sourceName, "transpose_nchw"); + std::string kernel = std::string("transpose_nchw_") + ioMemName + std::to_string(nDims); + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "transpose_nchw"); CHECK_STATUS(set_value_define_opt(nDims, "DN", opt)); CHECK_STATUS(set_io_mem_define_opt(inputMemType, outputMemType, opt)); return SUCCESS; diff --git a/compute/tensor/src/gpu/mali/cl/pooling.cl b/compute/tensor/src/gpu/mali/cl/pooling.cl index 158fd3b6..b6207663 100644 --- a/compute/tensor/src/gpu/mali/cl/pooling.cl +++ b/compute/tensor/src/gpu/mali/cl/pooling.cl @@ -72,6 +72,7 @@ __kernel void MANGLE_NAME(pooling_, IOM, PM)(const int iw_str, const int ph, const int kw, const int kh, + const int count_include_pad, READ_ONLY_KERNEL_MEM in, KERNEL_MEM out) { @@ -116,7 +117,7 @@ __kernel void MANGLE_NAME(pooling_, IOM, PM)(const int iw_str, ADD_IN_OFF } #if defined(USE_POOLING_MEAN) - float psize = (eh - bh) * (ew - bw); + float psize = count_include_pad ? 
(kh * kw) : ((eh - bh) * (ew - bw)); res = res / psize; #endif STORE_OUT; diff --git a/compute/tensor/src/gpu/mali/convolution.cpp b/compute/tensor/src/gpu/mali/convolution.cpp index 47e31470..7111f251 100644 --- a/compute/tensor/src/gpu/mali/convolution.cpp +++ b/compute/tensor/src/gpu/mali/convolution.cpp @@ -68,7 +68,7 @@ inline void convolution_produce_algos_paras(TensorDesc inputDesc, } algoNumIndex->push_back(vecH->size()); - if (fw == 3 && fh == 3 && sw == 1 && sh == 1 && dw == 1 && dh == 1 + if (fw == 3 && fh == 3 && ft == 1 && sw == 1 && sh == 1 && dw == 1 && dh == 1 && idf != DF_NCHW && odf != DF_NCHW && ic > 32 && fn >= 128 && ih > 64 && iw > 64) { convolutionAlgorithms->push_back(CONVOLUTION_ALGORITHM_WINOGRAD); @@ -76,6 +76,13 @@ inline void convolution_produce_algos_paras(TensorDesc inputDesc, get_gemm_tn_cal_scheme(vecH, vecC, vecK, mt, mt, GCL_MEM_BUF); algoNumIndex->push_back(vecH->size()); } + + if (sw == 1 && sh == 1 && dw == 1 && dh == 1 && fw * fh > 1 && ft == 1 + && idf != DF_NCHW && odf != DF_NCHW && ic > iw * 4 && ic > ih * 4) { + convolutionAlgorithms->push_back(CONVOLUTION_ALGORITHM_INVGEMM); + CHECK_STATUS(get_conv_direct_cal_scheme(vecH, vecC, vecK, 1, 1, fn)); + algoNumIndex->push_back(vecH->size()); + } } inline void infer_align_val(ConvolutionForwardAlgorithm algo, @@ -214,6 +221,14 @@ EE convolution_infer_forward_algorithm_mali(GCLHandle_t handle, if (policy == CONVOLUTION_FASTEST) { CHECK_STATUS(NOT_SUPPORTED); } + GCLMemType imt = inputMemDesc.memType; + GCLMemType omt = outputMemDesc.memType; + std::vector filterDescVec(1, filterDesc); + std::vector flag = build_conv_forward_algorithm_flag( + inputDesc, filterDescVec, OT_Conv, imt, omt, convParamSpec); + if (gcl_get_runInfo_from_cache(handle, flag, forwardRunInfo)) { + return SUCCESS; + } DataType dt; U32 ic, ih, iw, fn, fh, fw, ft; tensorSelectGet(inputDesc, NULL, NULL, NULL, &ic, &ih, &iw); @@ -230,8 +245,6 @@ EE convolution_infer_forward_algorithm_mali(GCLHandle_t handle, std::vector vecK; DataFormat idf = inputDesc.df; DataFormat odf = outputDesc.df; - GCLMemType imt = inputMemDesc.memType; - GCLMemType omt = outputMemDesc.memType; convolution_produce_algos_paras(inputDesc, filterDesc, convParamSpec, idf, odf, imt, omt, &convolutionAlgorithms, &algoNumIndex, &vecH, &vecC, &vecK); if (vecH.size() == 1) { @@ -328,13 +341,19 @@ EE convolution_infer_forward_algorithm_mali(GCLHandle_t handle, gcl_create_memory(handle, filter); gcl_create_memory(handle, bias); gcl_create_memory(handle, biasbuf); - std::vector tmpDir(3, NULL); + std::vector tmpDir(1, NULL); + std::vector tmpInv(1, NULL); std::vector tmpWino(3, NULL); - std::vector tmp; - tmpbuf->desc.byteSize = maxBytes[0] + 1; + std::vector tmp(3, NULL); + if (maxBytes[0]) { + tmpbuf->desc.byteSize = maxBytes[0]; + } else { + tmpbuf->desc.byteSize = 128; + } gcl_create_memory(handle, tmpbuf); tmpDir[0] = tmpbuf; tmpWino[0] = tmpbuf; + tmpInv[0] = tmpbuf; if (check_qualcomm_device() && maxBytes[1] > 0 && maxBytes[2] > 0 && maxBytes[3] > 0) { tmpImgA->desc.memType = GCL_MEM_IMG_3D; @@ -355,16 +374,19 @@ EE convolution_infer_forward_algorithm_mali(GCLHandle_t handle, tmpWino[2] = tmpImgB; } - double minTimeWinograd = DBL_MAX; double minTime = DBL_MAX; + double minTimeWinograd = DBL_MAX; double winogradPicTranTime = DBL_MAX; double winogradOutTranTime = DBL_MAX; + double minTimeInvGemm = DBL_MAX; + double invGemmCol2ImgTime = DBL_MAX; U32 runKernelBe = 0; U32 runKernelEnd = 0; ForwardRunInfoMali bestRunInfo; ForwardRunInfoMali bestRunInfoWinograd; + 
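// --- Illustrative CPU analogue, not part of the patch ---
// This refers to the count_include_pad change to pooling.cl earlier in this patch:
// with count_include_pad set, mean pooling divides by the full window kh*kw,
// otherwise only by the number of in-bounds elements, exactly as psize is chosen there.
#include <algorithm>
#include <vector>

static float mean_pool_at(const std::vector<float> &in, int iw, int ih,
    int ox, int oy, int kw, int kh, int sw, int sh, int pw, int ph,
    bool count_include_pad)
{
    int bw = std::max(ox * sw - pw, 0);             // window clipped to the input
    int bh = std::max(oy * sh - ph, 0);
    int ew = std::min(ox * sw - pw + kw, iw);
    int eh = std::min(oy * sh - ph + kh, ih);
    float res = 0;
    for (int y = bh; y < eh; ++y) {
        for (int x = bw; x < ew; ++x) {
            res += in[y * iw + x];
        }
    }
    float psize = count_include_pad ? float(kh * kw) : float((eh - bh) * (ew - bw));
    return res / psize;
}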
ForwardRunInfoMali bestRunInfoInvGemm; GCLMem_t fltMem = filter; - tmp = tmpDir; + tmp[0] = tmpDir[0]; for (U32 i = 0; i < algosNum; i++) { GCLMem_t biasMem = (runInfos[i].best_k[0] == 0) ? biasbuf : bias; if (check_qualcomm_device()) { @@ -376,14 +398,22 @@ EE convolution_infer_forward_algorithm_mali(GCLHandle_t handle, break; } } - if (runInfos[i].algorithm == (I32)CONVOLUTION_ALGORITHM_WINOGRAD) { + if (runInfos[i].algorithm == (I32)CONVOLUTION_ALGORITHM_DIRECT) { + fltMem = filter; + tmp[0] = tmpDir[0]; + } else if (runInfos[i].algorithm == (I32)CONVOLUTION_ALGORITHM_INVGEMM) { + fltMem = filter; + tmp[0] = tmpInv[0]; + } else if (runInfos[i].algorithm == (I32)CONVOLUTION_ALGORITHM_WINOGRAD) { if (useWinoFltImg) { gcl_create_memory(handle, filterImg); useWinoFltImg = false; - fltMem = filterImg; } - tmp = tmpWino; - } + fltMem = filterImg; + for (U32 i = 0; i < 3; i++) { + tmp[i] = tmpWino[i]; + } + } } if (convolution_mali(handle, inputDesc, input, filterDesc, fltMem, convParamSpec, &runInfos[i], scaleDesc, NULL, biasDesc, biasMem, maxBytes[0], tmp, outputDesc, @@ -391,11 +421,11 @@ EE convolution_infer_forward_algorithm_mali(GCLHandle_t handle, if (runInfos[i].algorithm == (I32)CONVOLUTION_ALGORITHM_DIRECT) { runKernelEnd = handle->kernelVec->size(); gcl_run_kernelVec_timing(handle, runKernelEnd - 1, runKernelEnd); - runKernelBe = runKernelEnd; if (minTime > handle->t_execute) { minTime = handle->t_execute; bestRunInfo = runInfos[i]; } + runKernelBe = runKernelEnd; } if (runInfos[i].algorithm == (I32)CONVOLUTION_ALGORITHM_WINOGRAD) { @@ -416,6 +446,19 @@ EE convolution_infer_forward_algorithm_mali(GCLHandle_t handle, } runKernelBe = runKernelEnd; } + if (runInfos[i].algorithm == (I32)CONVOLUTION_ALGORITHM_INVGEMM) { + runKernelEnd = handle->kernelVec->size(); + if (invGemmCol2ImgTime == DBL_MAX) { + gcl_run_kernelVec_timing(handle, runKernelEnd - 1, runKernelEnd); + invGemmCol2ImgTime = handle->t_execute; + } + gcl_run_kernelVec_timing(handle, runKernelEnd - 2, runKernelEnd - 1); + if (minTimeInvGemm > handle->t_execute) { + minTimeInvGemm = handle->t_execute; + bestRunInfoInvGemm = runInfos[i]; + } + runKernelBe = runKernelEnd; + } } } @@ -426,10 +469,18 @@ EE convolution_infer_forward_algorithm_mali(GCLHandle_t handle, minTime = minTimeWinograd; bestRunInfo = bestRunInfoWinograd; } + if (minTimeInvGemm != DBL_MAX) { + minTimeInvGemm = minTimeInvGemm + invGemmCol2ImgTime; + } + if (minTimeInvGemm < minTime) { + minTime = minTimeInvGemm; + bestRunInfo = bestRunInfoInvGemm; + } if (minTime == DBL_MAX) { CHECK_STATUS(NOT_SUPPORTED); } *forwardRunInfo = bestRunInfo; + gcl_set_runInfo_to_cache(handle, flag, bestRunInfo); CHECK_STATUS(gcl_finish(handle)); gcl_destroy_gclmem(input); gcl_destroy_gclmem(filter); diff --git a/compute/tensor/src/gpu/mali/deconvolution.cpp b/compute/tensor/src/gpu/mali/deconvolution.cpp index 4045344f..ae7d23c9 100644 --- a/compute/tensor/src/gpu/mali/deconvolution.cpp +++ b/compute/tensor/src/gpu/mali/deconvolution.cpp @@ -98,6 +98,14 @@ EE deconvolution_infer_forward_algorithm_mali(GCLHandle_t handle, if (algorithm != CONVOLUTION_ALGORITHM_NULL) { return SUCCESS; } + GCLMemType imt = inputMemDesc.memType; + GCLMemType omt = outputMemDesc.memType; + std::vector filterDescVec(1, filterDesc); + std::vector flag = build_conv_forward_algorithm_flag( + inputDesc, filterDescVec, OT_Deconvolution, imt, omt, convParamSpec); + if (gcl_get_runInfo_from_cache(handle, flag, forwardRunInfo)) { + return SUCCESS; + } DataType dt; U32 ih, iw, fc, fh, fw; 
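// --- Simplified stand-in, not the real gcl_* cache ---
// Sketch of the caching pattern these hunks add around algorithm selection:
// a key is built from shapes/memory types, tuning is skipped on a hit, and the
// winning configuration is stored after timing. All types below are placeholders.
#include <map>
#include <vector>

struct RunInfo { int algorithm = 0; int best_h = 0, best_c = 0, best_k = 0; };
using Flag = std::vector<int>;   // stands in for build_conv_forward_algorithm_flag()'s result

static std::map<Flag, RunInfo> g_runInfoCache;

static RunInfo select_algorithm(const Flag &flag, RunInfo (*tune)())
{
    auto hit = g_runInfoCache.find(flag);
    if (hit != g_runInfoCache.end()) {
        return hit->second;      // mirrors the gcl_get_runInfo_from_cache() early return
    }
    RunInfo best = tune();       // time DIRECT/WINOGRAD/INVGEMM candidates, keep the fastest
    g_runInfoCache[flag] = best; // mirrors gcl_set_runInfo_to_cache()
    return best;
}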
tensorSelectGet(inputDesc, NULL, NULL, NULL, NULL, &ih, &iw); @@ -210,6 +218,7 @@ EE deconvolution_infer_forward_algorithm_mali(GCLHandle_t handle, CHECK_STATUS(NOT_SUPPORTED); } *forwardRunInfo = bestRunInfo; + gcl_set_runInfo_to_cache(handle, flag, bestRunInfo); CHECK_STATUS(gcl_finish(handle)); gcl_destroy_gclmem(input); gcl_destroy_gclmem(filter); diff --git a/compute/tensor/src/gpu/mali/depth2space.cpp b/compute/tensor/src/gpu/mali/depth2space.cpp index efb113b9..97b331e7 100644 --- a/compute/tensor/src/gpu/mali/depth2space.cpp +++ b/compute/tensor/src/gpu/mali/depth2space.cpp @@ -49,14 +49,14 @@ EE depth2space_padding_input_mali(TensorDesc inputDesc, U32 ow, oh, oc, on; tensorSelectGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw); on = in; - oc = ic / (p.blockSize * p.blockSize); - oh = ih * p.blockSize; - ow = iw * p.blockSize; - if (ic % (p.blockSize * p.blockSize) != 0) { + oc = ic / (p.block_size * p.block_size); + oh = ih * p.block_size; + ow = iw * p.block_size; + if (ic % (p.block_size * p.block_size) != 0) { return NOT_MATCH; } DataFormat odf = idf; - if ((p.blockSize == 2 && oc < 4) || p.blockSize != 2) { + if ((p.block_size == 2 && oc < 4) || p.block_size != 2) { odf = DF_NCHW; } *outputDesc = tensor4df(idt, odf, on, oc, oh, ow); diff --git a/compute/tensor/src/gpu/mali/depthwise_convolution.cpp b/compute/tensor/src/gpu/mali/depthwise_convolution.cpp index fcfcfef5..f5bbcaff 100644 --- a/compute/tensor/src/gpu/mali/depthwise_convolution.cpp +++ b/compute/tensor/src/gpu/mali/depthwise_convolution.cpp @@ -103,6 +103,14 @@ EE depthwise_convolution_infer_forward_algorithm_mali(GCLHandle_t handle, if (policy == CONVOLUTION_FASTEST) { CHECK_STATUS(NOT_SUPPORTED); } + GCLMemType imt = inputMemDesc.memType; + GCLMemType omt = outputMemDesc.memType; + std::vector filterDescVec(1, filterDesc); + std::vector flag = build_conv_forward_algorithm_flag( + inputDesc, filterDescVec, OT_Conv, imt, omt, convParamSpec); + if (gcl_get_runInfo_from_cache(handle, flag, forwardRunInfo)) { + return SUCCESS; + } U32 dw = convParamSpec.dilatedRate_w; U32 dh = convParamSpec.dilatedRate_h; std::vector depthwiseConvAlgorithms; @@ -217,6 +225,7 @@ EE depthwise_convolution_infer_forward_algorithm_mali(GCLHandle_t handle, CHECK_STATUS(NOT_SUPPORTED); } *forwardRunInfo = bestRunInfo; + gcl_set_runInfo_to_cache(handle, flag, bestRunInfo); CHECK_STATUS(gcl_finish(handle)); gcl_destroy_gclmem(input); gcl_destroy_gclmem(filter); diff --git a/compute/tensor/src/gpu/mali/depthwise_pointwise_convolution.cpp b/compute/tensor/src/gpu/mali/depthwise_pointwise_convolution.cpp index 8f66eab3..1cb308e6 100644 --- a/compute/tensor/src/gpu/mali/depthwise_pointwise_convolution.cpp +++ b/compute/tensor/src/gpu/mali/depthwise_pointwise_convolution.cpp @@ -110,12 +110,12 @@ EE depthwise_pointwise_convolution_padding_input_mali(TensorDesc inputDesc, } ih_align *= sh; U32 fhd = (fh - 1) * dh + 1; - U32 pl = convParamSpec.padding_left; - U32 pr = convParamSpec.padding_right; - U32 pt = convParamSpec.padding_top; + U32 pl = convParamSpec.pad_left; + U32 pr = convParamSpec.pad_right; + U32 pt = convParamSpec.pad_top; U32 pb = ih_align + (fhd / 2 * 2) - pt - ih; - if (pb < convParamSpec.padding_bottom) { - pb = convParamSpec.padding_bottom; + if (pb < convParamSpec.pad_bottom) { + pb = convParamSpec.pad_bottom; } inputMem->padding(pl, pr, pt, pb); } @@ -149,6 +149,14 @@ EE depthwise_pointwise_convolution_infer_forward_algorithm_mali(GCLHandle_t hand if (policy == CONVOLUTION_FASTEST) { CHECK_STATUS(NOT_SUPPORTED); } + GCLMemType imt 
= inputMemDesc.memType; + GCLMemType omt = outputMemDesc.memType; + std::vector filterDescVec = {dwFilterDesc, pwFilterDesc}; + std::vector flag = build_conv_forward_algorithm_flag( + inputDesc, filterDescVec, OT_Conv, imt, omt, convParamSpec); + if (gcl_get_runInfo_from_cache(handle, flag, forwardRunInfo)) { + return SUCCESS; + } std::vector depthwisePointwiseConvAlgorithms; std::vector algoNumIndexD; std::vector vecHD; @@ -372,6 +380,7 @@ EE depthwise_pointwise_convolution_infer_forward_algorithm_mali(GCLHandle_t hand } *forwardRunInfo = bestRunInfo[0]; + gcl_set_runInfo_to_cache(handle, flag, bestRunInfo[0]); CHECK_STATUS(gcl_finish(handle)); gcl_destroy_gclmem(input); gcl_destroy_gclmem(dwFilter); diff --git a/compute/tensor/src/gpu/mali/eltwise.cpp b/compute/tensor/src/gpu/mali/eltwise.cpp index 80c9035b..e81557b1 100644 --- a/compute/tensor/src/gpu/mali/eltwise.cpp +++ b/compute/tensor/src/gpu/mali/eltwise.cpp @@ -65,7 +65,7 @@ inline EE eltwise_checkpara_mali(GCLHandle_t handle, CHECK_STATUS(NULL_POINTER); } } - EltwiseMode eltwiseMode = eltwiseDesc.elt_mode; + EltwiseMode eltwiseMode = eltwiseDesc.mode; U32 arrayDimMax = 0; bool sameDesc = eltwise_same_desc(inputDesc, &arrayDimMax); if (sameDesc) { diff --git a/compute/tensor/src/gpu/mali/fp16/activation_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/activation_mali_fp16.cpp index 81fc3f13..05dc9d42 100644 --- a/compute/tensor/src/gpu/mali/fp16/activation_mali_fp16.cpp +++ b/compute/tensor/src/gpu/mali/fp16/activation_mali_fp16.cpp @@ -29,8 +29,6 @@ inline EE activation_core_mali_fp16(GCLHandle_t handle, GCLMem_t output, ActivationMode activationMode) { - UNUSED(inputDesc); - UNUSED(outputDesc); U32 ow, oh, oc, on; U32 iw_str, ih_str, iw_off, ih_off; U32 ow_str, oh_str, ow_off, oh_off; diff --git a/compute/tensor/src/gpu/mali/fp16/argmax_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/argmax_mali_fp16.cpp index 7f87ea2d..5e29c17b 100644 --- a/compute/tensor/src/gpu/mali/fp16/argmax_mali_fp16.cpp +++ b/compute/tensor/src/gpu/mali/fp16/argmax_mali_fp16.cpp @@ -64,10 +64,8 @@ inline EE argmax_core_mali_fp16(GCLHandle_t handle, Mem inv1 = input->mem; Mem ini1 = input->mem; Mem outv1024, outi1024, outv128, outi128; - char kernelName[128]; - char kernelNameIndex[128]; - sprintf(kernelName, "argmax_x"); - sprintf(kernelNameIndex, "argmax_x_index"); + const char *kernelName = "argmax_x"; + const char *kernelNameIndex = "argmax_x_index"; bool use_index = false; U32 offset = 0; U32 len = iw; diff --git a/compute/tensor/src/gpu/mali/fp16/bilateral_slice_apply_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/bilateral_slice_apply_mali_fp16.cpp index 1564e6f3..7b45135a 100644 --- a/compute/tensor/src/gpu/mali/fp16/bilateral_slice_apply_mali_fp16.cpp +++ b/compute/tensor/src/gpu/mali/fp16/bilateral_slice_apply_mali_fp16.cpp @@ -47,7 +47,7 @@ inline EE bilateral_slice_apply_core_mali_fp16(GCLHandle_t handle, tensorSelectGet(gridDesc, NULL, NULL, &gn, &gc, &gh, &gw); tensorSelectGet(outputDesc, NULL, NULL, &on, &oc, &oh, &ow); - U32 coe = bilateralSliceApplyParamSpec.coefficient_len; + U32 coe = bilateralSliceApplyParamSpec.coefficient; BilateralSliceApplyMode mode = bilateralSliceApplyParamSpec.mode; // bool has_offset = bilateralSliceApplyParamSpec.has_offset; U32 dep = gc / coe; @@ -60,7 +60,7 @@ inline EE bilateral_slice_apply_core_mali_fp16(GCLHandle_t handle, gridbuf = grid->mem; outbuf = output->mem; gridTran = tmpBuf->mem; - if (mode == BSliceApply_NULL) { + if (mode == BSLICE_APPLY_NULL) { guidebuf = guide->mem; } else { guidebuf = 
inbuf; @@ -80,11 +80,11 @@ inline EE bilateral_slice_apply_core_mali_fp16(GCLHandle_t handle, gcl_run_kernel_profiling(handle, kernel, dim0, gs0, ls0, "bilateral_slice_apply_pre")); CHECK_STATUS(gcl_print_memory(handle, grid, "bilateral_slice_apply_grid")); #endif - char kernelname[128]; - if (mode == BSliceApply_CONV) { - sprintf(kernelname, "bilateral_slice_apply_c12_conv"); + const char *kernelname; + if (mode == BSLICE_APPLY_CONV) { + kernelname = "bilateral_slice_apply_c12_conv"; } else { - sprintf(kernelname, "bilateral_slice_apply_c12"); + kernelname = "bilateral_slice_apply_c12"; } U32 gs[2] = {ow, oh}; U32 ls[2] = {0, 0}; @@ -98,7 +98,7 @@ inline EE bilateral_slice_apply_core_mali_fp16(GCLHandle_t handle, CHECK_STATUS(gcl_run_kernel_profiling(handle, kernel, dim, gs, ls, kernelname)); CHECK_STATUS(gcl_print_memory(handle, input, "bilateral_slice_apply_input")); CHECK_STATUS(gcl_print_memory(handle, output, "bilateral_slice_apply_output")); - if (mode == BSliceApply_NULL) { + if (mode == BSLICE_APPLY_NULL) { CHECK_STATUS(gcl_print_memory(handle, guide, "bilateral_slice_apply_guide")); } #endif diff --git a/compute/tensor/src/gpu/mali/fp16/convolution_direct_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/convolution_direct_mali_fp16.cpp index 8176c0d9..86658cfd 100644 --- a/compute/tensor/src/gpu/mali/fp16/convolution_direct_mali_fp16.cpp +++ b/compute/tensor/src/gpu/mali/fp16/convolution_direct_mali_fp16.cpp @@ -17,20 +17,29 @@ #include "gpu/mali/cl/kernel_option/conv_direct_opt.h" #include "gpu/mali/cl/kernel_option/gemv_opt.h" -inline TensorDesc get_nchw_desc_for_img(TensorDesc inputDesc, ConvolutionParamSpec convParamSpec) { +inline TensorDesc get_nchw_desc_for_img(TensorDesc inputDesc, ConvolutionParamSpec convParamSpec) +{ TensorDesc desc = inputDesc; - desc.dims[0] += convParamSpec.padding_left + convParamSpec.padding_right; - desc.dims[1] += convParamSpec.padding_bottom; + desc.dims[0] += convParamSpec.pad_left + convParamSpec.pad_right; + desc.dims[1] += convParamSpec.pad_bottom; return desc; } -inline EE trans_input_nchw_to_img(GCLHandle_t handle, TensorDesc inputDesc, GCLMem_t input, - ConvolutionParamSpec convParamSpec, GCLMem_t tmp, U32 *iw_str, U32 *ih_str, I32 *iw_off, I32 *ih_off) { +inline EE trans_input_nchw_to_img(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + ConvolutionParamSpec convParamSpec, + GCLMem_t tmp, + U32 *iw_str, + U32 *ih_str, + I32 *iw_off, + I32 *ih_off) +{ TensorDesc descNchwImg = get_nchw_desc_for_img(inputDesc, convParamSpec); GCLMem inputTran = *input; - inputTran.desc.dims[0] = descNchwImg.dims[0];//move left padding zero into img + inputTran.desc.dims[0] = descNchwImg.dims[0]; //move left padding zero into img inputTran.desc.dims[1] = descNchwImg.dims[1]; - inputTran.desc.offset[0] -= convParamSpec.padding_left; + inputTran.desc.offset[0] -= convParamSpec.pad_left; if (inputTran.desc.offset[0] < 0) { CHECK_STATUS(NOT_MATCH); } @@ -47,7 +56,7 @@ inline EE trans_input_nchw_to_img(GCLHandle_t handle, TensorDesc inputDesc, GCLM *iw_str = inputImg.desc.stride[0]; *ih_str = inputImg.desc.stride[1]; *iw_off = 0; - *ih_off = -convParamSpec.padding_top; + *ih_off = -convParamSpec.pad_top; return SUCCESS; } @@ -84,9 +93,9 @@ inline EE direct_core_nchw_to_nchwc4_mali_fp16(GCLHandle_t handle, sw = convParamSpec.stride_w; sh = convParamSpec.stride_h; st = convParamSpec.stride_t; - ph = convParamSpec.padding_top; - pw = convParamSpec.padding_left; - pt = convParamSpec.padding_before; + ph = convParamSpec.pad_top; + pw = 
convParamSpec.pad_left; + pt = convParamSpec.pad_before; tensorSelectGet(inputDesc, NULL, &df, NULL, &ic, &ih, &iw, &it); tensorSelectGet(outputDesc, NULL, NULL, &on, &oc, &oh, &ow, &ot); @@ -104,14 +113,14 @@ inline EE direct_core_nchw_to_nchwc4_mali_fp16(GCLHandle_t handle, o_off = oh_off * ow_str + ow_off; if (tmpBuf->desc.memType != GCL_MEM_BUF) { - CHECK_STATUS(trans_input_nchw_to_img(handle, inputDesc, input, convParamSpec, - tmpBuf, &iw_str, &ih_str, &iw_off, &ih_off)); + CHECK_STATUS(trans_input_nchw_to_img( + handle, inputDesc, input, convParamSpec, tmpBuf, &iw_str, &ih_str, &iw_off, &ih_off)); iwh_str = iw_str * ih_str; inbuf = tmpBuf->mem; imt = tmpBuf->desc.memType; } - U32 item_w = forwardRunInfo->best_h[0];//for nchw, reuse on w + U32 item_w = forwardRunInfo->best_h[0]; //for nchw, reuse on w char kernelName[128]; KernelOpt kernelOpt; Kernel kernel; @@ -125,8 +134,7 @@ inline EE direct_core_nchw_to_nchwc4_mali_fp16(GCLHandle_t handle, CHECK_STATUS(NOT_SUPPORTED); } CHECK_STATUS(set_conv_direct_nchw_to_nchwc4_opt_mali( - fw, fh, ft, sw, item_w, activationMode, - DT_F16, imt, omt, kernelName, &kernelOpt)); + fw, fh, ft, sw, item_w, activationMode, DT_F16, imt, omt, kernelName, &kernelOpt)); CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel, &kernelOpt)); if (ot > 1) { CHECK_STATUS(gcl_set_kernelArgs(kernel, iw_str, iwh_str, ic_str, iw_off, ih_off, ow_str, @@ -177,8 +185,8 @@ inline EE direct_core_fn_spe(GCLHandle_t handle, fh = convParamSpec.kernel_h; sw = convParamSpec.stride_w; sh = convParamSpec.stride_h; - ph = convParamSpec.padding_top; - pw = convParamSpec.padding_left; + ph = convParamSpec.pad_top; + pw = convParamSpec.pad_left; tensorSelectGet(inputDesc, NULL, NULL, NULL, NULL, &ih, &iw); tensorSelectGet(outputDesc, NULL, NULL, &on, &oc, &oh, &ow); fn = oc; @@ -269,9 +277,9 @@ inline EE direct_core_mali_fp16(GCLHandle_t handle, sw = convParamSpec.stride_w; sh = convParamSpec.stride_h; st = convParamSpec.stride_t; - ph = convParamSpec.padding_top; - pw = convParamSpec.padding_left; - pt = convParamSpec.padding_before; + ph = convParamSpec.pad_top; + pw = convParamSpec.pad_left; + pt = convParamSpec.pad_before; tensorSelectGet(inputDesc, NULL, NULL, &in, &ic, &ih, &iw, &it); tensorSelectGet(outputDesc, NULL, NULL, &on, &oc, &oh, &ow, &ot); if (on > 1 && ot > 1) { @@ -389,8 +397,8 @@ inline EE direct_dila_core_mali_fp16(GCLHandle_t handle, fh = convParamSpec.kernel_h; sw = convParamSpec.stride_w; sh = convParamSpec.stride_h; - pw = convParamSpec.padding_left; - ph = convParamSpec.padding_top; + pw = convParamSpec.pad_left; + ph = convParamSpec.pad_top; dw = convParamSpec.dilatedRate_w; dh = convParamSpec.dilatedRate_h; tensorSelectGet(inputDesc, NULL, NULL, &in, &ic, &ih, &iw); @@ -572,7 +580,8 @@ EE convolution_direct_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, bool useGemvMode = useGemvCalMode(inputDesc, convParamSpec, GCL_MEM_BUF, GCL_MEM_BUF); bool useNchwMode = useNchwCalMode(idf, fw, ic, dw, dh); if (useGemvMode) { - CHECK_STATUS(gemv_infer_forward_tmp_bytes_mali_fp16(inputDesc, outputDesc, bytes, forwardRunInfo)); + CHECK_STATUS( + gemv_infer_forward_tmp_bytes_mali_fp16(inputDesc, outputDesc, bytes, forwardRunInfo)); } else if (useNchwMode) { bool useImg = check_qualcomm_device(); if (useImg) { @@ -589,9 +598,9 @@ EE convolution_direct_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, bytes[3] = depth; } } - } else if (idf == DF_NCHW) {//use tran c1 to c4 - GCLMemDesc desc = convolution_get_input_nchwc4_desc(inputDesc, filterDesc, - 
convParamSpec, outputDesc, useNchwMode, forwardRunInfo); + } else if (idf == DF_NCHW) { //use tran c1 to c4 + GCLMemDesc desc = convolution_get_input_nchwc4_desc( + inputDesc, filterDesc, convParamSpec, outputDesc, useNchwMode, forwardRunInfo); if (desc.memType == GCL_MEM_IMG_3D) { bytes[1] = desc.stride[0]; bytes[2] = desc.stride[1]; diff --git a/compute/tensor/src/gpu/mali/fp16/convolution_invgemm_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/convolution_invgemm_mali_fp16.cpp new file mode 100644 index 00000000..558f2302 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/convolution_invgemm_mali_fp16.cpp @@ -0,0 +1,217 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "gpu/mali/fp16/convolution_mali_fp16.h" +#include "gpu/mali/fp16/convolution_invgemm_mali_fp16.h" +#include "gpu/mali/cl/kernel_option/conv_invgemm_opt.h" +#include "gpu/mali/cl/kernel_option/conv_direct_opt.h" + +inline EE invgemm_core_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc biasDesc, + const GCLMem_t bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode activationMode) +{ + cl_mem inbuf, biasmem, outbuf, fltbuf, tmp; + inbuf = input->mem; + fltbuf = filter->mem; + biasmem = bias->mem; + outbuf = output->mem; + tmp = tmpBuf->mem; + U32 iw, ih, ic, in; + U32 fw, fh, sw, sh, pl, pt; + U32 ow, oh, oc, on; + fw = convParamSpec.kernel_w; + fh = convParamSpec.kernel_h; + sw = convParamSpec.stride_w; + sh = convParamSpec.stride_h; + pl = convParamSpec.pad_left; + pt = convParamSpec.pad_top; + tensorSelectGet(inputDesc, NULL, NULL, &in, &ic, &ih, &iw); + tensorSelectGet(outputDesc, NULL, NULL, &on, &oc, &oh, &ow); + U32 item_h = forwardRunInfo->best_h[0]; + U32 item_c = forwardRunInfo->best_c[0]; + U32 item_k = forwardRunInfo->best_k[0]; + item_k = item_k >> 2; + + U32 iw_str, ih_str, ihw_str, ic_str, iw_off, ih_off, in_str; + get_gclmem_dim(input->desc, &iw_str, &ih_str, &ic_str, &iw_off, &ih_off); + U32 i_off = ih_off * iw_str + iw_off; + ihw_str = ih_str * iw_str; + ic_str = (ic + item_c - 1) / item_c; + in_str = ihw_str * ic_str; + U32 ow_str, oh_str, ow_off, oh_off, o_off; + get_gclmem_dim(output->desc, &ow_str, &oh_str, NULL, &ow_off, &oh_off); + o_off = oh_off * ow_str + ow_off; + + char kernelName[128]; + KernelOpt 
kernelOpt; + U32 gs[3]; + U32 ls[3] = {0, 0, 0}; + U32 dim; + Kernel kernel; + if (sw == 1 && sh == 1) { + U32 tw = iw; + U32 th = ih; + U32 tc = fw * fh * ((oc + 3) / 4 * 4); + U32 tw_str = iw; + U32 th_str = ih; + U32 t_off = 0; + U32 thw_str = tw_str * th_str; + U32 tn_str = thw_str * ((tc + 3) / 4); + gs[0] = tw; + gs[1] = (th + item_h - 1) / item_h; + gs[2] = tc / 4 / item_k * on; + dim = 3; + CHECK_STATUS(set_conv_direct_opt_mali(1, 1, 1, 1, item_h, item_k, true, ACTIVATION_NULL, + DT_F16, input->desc.memType, GCL_MEM_BUF, kernelName, &kernelOpt)); + CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel, &kernelOpt)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, iw_str, ihw_str, ic_str, iw_off, ih_off, tw_str, + thw_str, t_off, th, tc, sw, in_str, tn_str, gs[0], gs[1], inbuf, fltbuf, biasmem, tmp)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); + handle->t_total += handle->t_execute; +#endif + gs[0] = ow; + gs[1] = oh; + gs[2] = (oc + 3) / 4 * on; + I32 pw = fw - 1 - pl; + I32 ph = fh - 1 - pt; + CHECK_STATUS(set_conv_invgemm_col2img_opt( + activationMode, DT_F16, GCL_MEM_BUF, output->desc.memType, kernelName, &kernelOpt)); + CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel, &kernelOpt)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, iw, ih, fw, fh, pw, ph, ow_str, oh_str, o_off, oc, + gs[0], gs[1], tmp, biasmem, outbuf)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); + handle->t_total += handle->t_execute; +#endif + } else { + CHECK_STATUS(NOT_SUPPORTED); + } + return SUCCESS; +} + +inline TensorDesc transform_filter_desc(TensorDesc filterDesc, U32 item_c, U32 item_k) +{ + U32 fw, fh, fc, fn; + tensorSelectGet(filterDesc, NULL, NULL, &fn, &fc, &fh, &fw); + TensorDesc desc; + desc.df = DF_NCHW; + desc.dt = DT_F16; + desc.nDims = 4; + desc.dims[0] = item_k * item_c; + desc.dims[1] = (fc + item_c - 1) / item_c; + desc.dims[2] = (fn + item_k - 1) / item_k * fw * fh; + desc.dims[3] = 1; + return desc; +} + +EE convolution_invgemm_transform_filter_bytes_mali_fp16( + TensorDesc filterDesc, ForwardRunInfoMali_t forwardRunInfo, TensorDesc *ftmDesc) +{ + U32 item_c = forwardRunInfo->best_c[0]; + U32 item_k = forwardRunInfo->best_k[0]; + *ftmDesc = transform_filter_desc(filterDesc, item_c, item_k); + return SUCCESS; +} + +EE convolution_invgemm_transform_filter_mali_fp16(GCLHandle_t handle, + TensorDesc filterDesc, + GCLMem_t filter, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc *fltmemDesc, + GCLMem_t fltmem) +{ + DataType fdt; + DataFormat fdf; + U32 fw, fh, fc, fn; + tensorSelectGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw); + U32 fwh = fw * fh; + U32 item_c = forwardRunInfo->best_c[0]; + U32 item_k = forwardRunInfo->best_k[0]; + char kernelName[128]; + KernelOpt kernelOpt; + Kernel kernel; + U32 gs[3] = {0, 0, 0}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + CHECK_STATUS(set_conv_invgemm_trans_flt_opt(item_k, DT_F16, kernelName, &kernelOpt)); + gs[0] = fwh; + gs[1] = (fc + item_c - 1) / item_c; + gs[2] = (fn + item_k - 1) / item_k * item_k; + CHECK_STATUS(gcl_get_kernel_from_map(handle, kernelName, &kernel, &kernelOpt)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, fw, fh, fwh, fc, fn, filter->mem, fltmem->mem)); + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); + *fltmemDesc = transform_filter_desc(filterDesc, item_c, item_k); + return SUCCESS; +} + 
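// --- Illustrative CPU analogue, not part of the patch ---
// The INVGEMM path above runs in two stages for stride 1: every filter tap becomes a
// 1x1 convolution writing into an intermediate with fw*fh*oc channels (the GEMM),
// and a col2img kernel then accumulates those taps into the output. The real OpenCL
// kernels use NCHWC4 layouts and different indexing; this plain loop version only
// shows the decomposition, assuming stride 1 and output spatial size equal to input.
#include <vector>

// in : [ic][ih][iw], w : [oc][ic][fh][fw], out : [oc][ih][iw]
static void conv_invgemm_cpu(const std::vector<float> &in, const std::vector<float> &w,
    std::vector<float> &out, int ic, int ih, int iw, int oc, int fh, int fw, int pt, int pl)
{
    // Stage 1: per-tap 1x1 GEMM -> tmp[(ky*fw+kx)*oc + c][y][x]
    std::vector<float> tmp((size_t)fh * fw * oc * ih * iw, 0.f);
    for (int ky = 0; ky < fh; ++ky)
        for (int kx = 0; kx < fw; ++kx)
            for (int c = 0; c < oc; ++c)
                for (int y = 0; y < ih; ++y)
                    for (int x = 0; x < iw; ++x) {
                        float acc = 0;
                        for (int ci = 0; ci < ic; ++ci)
                            acc += w[((c * ic + ci) * fh + ky) * fw + kx] * in[(ci * ih + y) * iw + x];
                        tmp[(((ky * fw + kx) * oc + c) * ih + y) * iw + x] = acc;
                    }
    // Stage 2: col2img-style accumulation of the taps into out[c][oy][ox]
    out.assign((size_t)oc * ih * iw, 0.f);
    for (int c = 0; c < oc; ++c)
        for (int oy = 0; oy < ih; ++oy)
            for (int ox = 0; ox < iw; ++ox)
                for (int ky = 0; ky < fh; ++ky)
                    for (int kx = 0; kx < fw; ++kx) {
                        int y = oy + ky - pt, x = ox + kx - pl;
                        if (y < 0 || y >= ih || x < 0 || x >= iw) {
                            continue;   // padded positions contribute nothing
                        }
                        out[(c * ih + oy) * iw + ox] += tmp[(((ky * fw + kx) * oc + c) * ih + y) * iw + x];
                    }
}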
+EE convolution_invgemm_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes) +{ + DataType dt = inputDesc.dt; + U32 iw = inputDesc.dims[0]; + U32 ih = inputDesc.dims[1]; + U32 fw = convParamSpec.kernel_w; + U32 fh = convParamSpec.kernel_h; + U32 oc = outputDesc.dims[outputDesc.nDims - 2]; + U32 on = outputDesc.dims[outputDesc.nDims - 1]; + U32 bufSize = 0; + U32 item_c = forwardRunInfo->best_c[0]; + U32 item_k = forwardRunInfo->best_k[0]; + + U32 tw = iw; + U32 th = ih; + U32 tc = fw * fh * ((oc + 3) / 4 * 4); + U32 tn = on; + bufSize = tw * th * tc * tn * bytesOf(dt); + *bytes = bufSize; + return SUCCESS; +} + +EE convolution_invgemm_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc biasDesc, + const GCLMem_t bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode activationMode) +{ + CHECK_STATUS(fill_output_zero(handle, output, outputDesc)); + CHECK_STATUS(invgemm_core_mali_fp16(handle, inputDesc, input, filterDesc, filter, convParamSpec, + forwardRunInfo, biasDesc, bias, tmpBytes, tmpBuf, outputDesc, output, activationMode)); + return SUCCESS; +} diff --git a/compute/tensor/src/gpu/mali/fp16/convolution_invgemm_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/convolution_invgemm_mali_fp16.h new file mode 100644 index 00000000..49dcf1a0 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/convolution_invgemm_mali_fp16.h @@ -0,0 +1,50 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
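// --- Worked example, not part of the patch ---
// Size of the INVGEMM temporary buffer computed by
// convolution_invgemm_infer_forward_tmp_bytes_mali_fp16 above: iw * ih * tc * on *
// bytesOf(dt), with tc = fw * fh * ((oc + 3) / 4 * 4). FP16 means 2 bytes per element;
// the shapes below are hypothetical and chosen only for illustration.
#include <cstdio>

int main()
{
    unsigned iw = 7, ih = 7, fw = 3, fh = 3, oc = 256, on = 1;  // hypothetical shapes
    unsigned tc = fw * fh * ((oc + 3) / 4 * 4);   // 9 * 256 = 2304 intermediate channels
    unsigned bytes = iw * ih * tc * on * 2;       // 7 * 7 * 2304 * 2 = 225,792 bytes
    std::printf("invgemm tmp buffer: %u bytes\n", bytes);
    return 0;
}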
+ +#ifndef _H_CONVOLUTION_INVGEMM_MALI_FP16 +#define _H_CONVOLUTION_INVGEMM_MALI_FP16 + +#include "gpu/mali/fp16/tensor_computing_fp16.h" + +EE convolution_invgemm_transform_filter_bytes_mali_fp16( + TensorDesc filterDesc, ForwardRunInfoMali_t forwardRunInfo, TensorDesc *ftmDesc); + +EE convolution_invgemm_transform_filter_mali_fp16(GCLHandle_t handle, + TensorDesc filterDesc, + GCLMem_t filter, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc *fltmemDesc, + GCLMem_t fltmem); + +EE convolution_invgemm_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes); + +EE convolution_invgemm_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc biasDesc, + const GCLMem_t bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode activationMode); +#endif diff --git a/compute/tensor/src/gpu/mali/fp16/convolution_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/convolution_mali_fp16.cpp index c4508530..ee1fbcf5 100644 --- a/compute/tensor/src/gpu/mali/fp16/convolution_mali_fp16.cpp +++ b/compute/tensor/src/gpu/mali/fp16/convolution_mali_fp16.cpp @@ -14,6 +14,7 @@ #include "gpu/mali/fp16/convolution_mali_fp16.h" #include "gpu/mali/fp16/convolution_direct_mali_fp16.h" #include "gpu/mali/fp16/convolution_wino_mali_fp16.h" +#include "gpu/mali/fp16/convolution_invgemm_mali_fp16.h" inline EE convolution_checkpara_mali_fp16(GCLHandle_t handle, TensorDesc inputDesc, @@ -61,6 +62,10 @@ EE convolution_transform_filter_bytes_mali_fp16( ret = convolution_wino_transform_filter_bytes_mali_fp16( filterDesc, forwardRunInfo, ftmDesc); break; + case CONVOLUTION_ALGORITHM_INVGEMM: + ret = convolution_invgemm_transform_filter_bytes_mali_fp16( + filterDesc, forwardRunInfo, ftmDesc); + break; default: ret = NOT_SUPPORTED; break; @@ -90,6 +95,10 @@ EE convolution_transform_filter_mali_fp16(GCLHandle_t handle, ret = convolution_wino_transform_filter_mali_fp16( handle, filterDesc, filter, forwardRunInfo, fltmemDesc, fltmem, tmp); break; + case CONVOLUTION_ALGORITHM_INVGEMM: + ret = convolution_invgemm_transform_filter_mali_fp16( + handle, filterDesc, filter, forwardRunInfo, fltmemDesc, fltmem); + break; default: ret = NOT_SUPPORTED; break; @@ -118,6 +127,10 @@ EE convolution_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, ret = convolution_wino_infer_forward_tmp_bytes_mali_fp16( inputDesc, filterDesc, outputDesc, convParamSpec, forwardRunInfo, bytes); break; + case CONVOLUTION_ALGORITHM_INVGEMM: + ret = convolution_invgemm_infer_forward_tmp_bytes_mali_fp16( + inputDesc, filterDesc, outputDesc, convParamSpec, forwardRunInfo, bytes); + break; default: ret = NOT_SUPPORTED; break; @@ -158,6 +171,11 @@ EE convolution_mali_fp16(GCLHandle_t handle, convParamSpec, forwardRunInfo, biasDesc, bias, tmpBytes, tmpBuf, outputDesc, output, activationMode); break; + case CONVOLUTION_ALGORITHM_INVGEMM: + ret = convolution_invgemm_mali_fp16(handle, inputDesc, input, filterDesc, filter, + convParamSpec, forwardRunInfo, biasDesc, bias, tmpBytes, tmpBuf[0], outputDesc, output, + activationMode); + break; default: ret = NOT_SUPPORTED; break; diff --git a/compute/tensor/src/gpu/mali/fp16/convolution_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/convolution_mali_fp16.h index b664d39e..bd1ae9a9 
100644 --- a/compute/tensor/src/gpu/mali/fp16/convolution_mali_fp16.h +++ b/compute/tensor/src/gpu/mali/fp16/convolution_mali_fp16.h @@ -16,11 +16,7 @@ #include "gpu/mali/fp16/tensor_computing_fp16.h" #include "gpu/mali/cl/kernel_option/common_opt.h" -inline bool useNchwCalMode(DataFormat idf, - U32 fw, - U32 ic, - U32 dw, - U32 dh) +inline bool useNchwCalMode(DataFormat idf, U32 fw, U32 ic, U32 dw, U32 dh) { bool useNchwMode = false; bool qualCommDev = check_qualcomm_device(); @@ -54,8 +50,18 @@ inline bool useGemvCalMode( } inline void calPaddingVal(TensorDesc inputDesc, - TensorDesc filterDesc, ConvolutionParamSpec convParamSpec, U32 w_align, U32 h_align, U32 n_align, - bool useNchwMode, U32 *pl, U32 *pr, U32 *pt, U32 *pb, U32 *pa, U32 *pf) + TensorDesc filterDesc, + ConvolutionParamSpec convParamSpec, + U32 w_align, + U32 h_align, + U32 n_align, + bool useNchwMode, + U32 *pl, + U32 *pr, + U32 *pt, + U32 *pb, + U32 *pa, + U32 *pf) { U32 iw, ih, ic, it, in; tensorSelectGet(inputDesc, NULL, NULL, &in, &ic, &ih, &iw, &it); @@ -63,10 +69,10 @@ inline void calPaddingVal(TensorDesc inputDesc, U32 fh = convParamSpec.kernel_h; U32 sh = convParamSpec.stride_h; U32 dh = convParamSpec.dilatedRate_h; - U32 fhd = (fh - 1) * dh + 1; + U32 fhd = (fh - 1) * dh + 1; h_align *= sh; - plv = convParamSpec.padding_left; - ptv = convParamSpec.padding_top; + plv = convParamSpec.pad_left; + ptv = convParamSpec.pad_top; if (useNchwMode) { U32 fw = convParamSpec.kernel_w; U32 sw = convParamSpec.stride_w; @@ -74,18 +80,18 @@ inline void calPaddingVal(TensorDesc inputDesc, U32 fwd = (fw - 1) * dw + 1; w_align *= sw; prv = w_align + (fwd / 2 * 2) - plv - iw; - if (prv < convParamSpec.padding_right) { - prv = convParamSpec.padding_right; + if (prv < convParamSpec.pad_right) { + prv = convParamSpec.pad_right; } pbv = h_align + (fhd / 2 * 2) - ptv - ih; - if (pbv < convParamSpec.padding_bottom) { - pbv = convParamSpec.padding_bottom; + if (pbv < convParamSpec.pad_bottom) { + pbv = convParamSpec.pad_bottom; } - } else { - prv = convParamSpec.padding_right; + } else { + prv = convParamSpec.pad_right; pbv = h_align + (fhd / 2 * 2) - ptv - ih; - if (pbv < convParamSpec.padding_bottom) { - pbv = convParamSpec.padding_bottom; + if (pbv < convParamSpec.pad_bottom) { + pbv = convParamSpec.pad_bottom; } ic = (ic + 3) / 4; } diff --git a/compute/tensor/src/gpu/mali/fp16/convolution_wino_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/convolution_wino_mali_fp16.cpp index b589879d..f92a3b15 100644 --- a/compute/tensor/src/gpu/mali/fp16/convolution_wino_mali_fp16.cpp +++ b/compute/tensor/src/gpu/mali/fp16/convolution_wino_mali_fp16.cpp @@ -16,14 +16,15 @@ #include "gpu/mali/cl/kernel_option/conv_wino_opt.h" #include "gpu/mali/cl/kernel_option/gemm_tn_opt.h" -TensorDesc getInputPreProcessDesc(TensorDesc inputDesc, ConvolutionParamSpec convParamSpec, U32 wino_w, U32 wino_h) +TensorDesc getInputPreProcessDesc( + TensorDesc inputDesc, ConvolutionParamSpec convParamSpec, U32 wino_w, U32 wino_h) { U32 fw = convParamSpec.kernel_w; U32 fh = convParamSpec.kernel_h; - U32 pl = convParamSpec.padding_left; - U32 pr = convParamSpec.padding_right; - U32 pt = convParamSpec.padding_top; - U32 pb = convParamSpec.padding_bottom; + U32 pl = convParamSpec.pad_left; + U32 pr = convParamSpec.pad_right; + U32 pt = convParamSpec.pad_top; + U32 pb = convParamSpec.pad_bottom; TensorDesc desc = inputDesc; desc.df = DF_NCHW; desc.dims[0] = wino_w * 4; @@ -35,7 +36,7 @@ TensorDesc getInputPreProcessDesc(TensorDesc inputDesc, ConvolutionParamSpec con return 
desc; } -TensorDesc getPicTranDesc(DataType dt, U32 wino_w, U32 wino_h, U32 wino_num, U32 ic, U32 item_n) +TensorDesc getPicTranDesc(DataType dt, U32 wino_w, U32 wino_h, U32 wino_num, U32 ic, U32 item_n) { TensorDesc desc; desc.df = DF_NCHW; @@ -48,7 +49,7 @@ TensorDesc getPicTranDesc(DataType dt, U32 wino_w, U32 wino_h, U32 wino_num, U32 return desc; } -TensorDesc getGemmOutDesc(DataType dt, U32 M, U32 N, U32 wino_num) +TensorDesc getGemmOutDesc(DataType dt, U32 M, U32 N, U32 wino_num) { TensorDesc desc; desc.df = DF_NCHW; @@ -61,17 +62,30 @@ TensorDesc getGemmOutDesc(DataType dt, U32 M, U32 N, U32 wino_num) return desc; } -inline EE wino_preprocess_input(GCLHandle_t handle, DataType dt, DataFormat df, - U32 iw_str, U32 ih_str, U32 i_off, U32 ow_str, U32 oh_str, - U32 iw, U32 ih, U32 ic, U32 pw, U32 ph, - GCLMemType imt, GCLMemType omt, Mem in, Mem out) +inline EE wino_preprocess_input(GCLHandle_t handle, + DataType dt, + DataFormat df, + U32 iw_str, + U32 ih_str, + U32 i_off, + U32 ow_str, + U32 oh_str, + U32 iw, + U32 ih, + U32 ic, + U32 pw, + U32 ph, + GCLMemType imt, + GCLMemType omt, + Mem in, + Mem out) { char kernelName[128]; KernelOpt kernelOpt; Kernel kernel; - bool useNchwFormat = (df == DF_NCHW) ? true :false; - CHECK_STATUS(set_conv_wino_preprocess_input_opt(dt, useNchwFormat, imt, omt, - kernelName, &kernelOpt)); + bool useNchwFormat = (df == DF_NCHW) ? true : false; + CHECK_STATUS( + set_conv_wino_preprocess_input_opt(dt, useNchwFormat, imt, omt, kernelName, &kernelOpt)); U32 gs[3] = {(ow_str + 3) / 4, oh_str, (ic + 3) / 4}; U32 ls[3] = {0}; U32 dim = 3; @@ -80,8 +94,8 @@ inline EE wino_preprocess_input(GCLHandle_t handle, DataType dt, DataFormat df, gs[2] = ic; } CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel, &kernelOpt)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, iw_str, ih_str, i_off, ow_str, oh_str, - iw, ih, ic, pw, ph, gs[0], gs[1], in, out)); + CHECK_STATUS(gcl_set_kernelArgs( + kernel, iw_str, ih_str, i_off, ow_str, oh_str, iw, ih, ic, pw, ph, gs[0], gs[1], in, out)); gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName); #ifdef _DEBUG CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); @@ -90,21 +104,31 @@ inline EE wino_preprocess_input(GCLHandle_t handle, DataType dt, DataFormat df, return SUCCESS; } -inline EE wino_trans_pic_nchw(GCLHandle_t handle, DataType dt, U32 wino_w, U32 wino_h, U32 ic, - U32 iw_str, U32 ih_str, U32 i_off, U32 pw_str, U32 pwh_str, - GCLMemType imt, Mem in, Mem out) +inline EE wino_trans_pic_nchw(GCLHandle_t handle, + DataType dt, + U32 wino_w, + U32 wino_h, + U32 ic, + U32 iw_str, + U32 ih_str, + U32 i_off, + U32 pw_str, + U32 pwh_str, + GCLMemType imt, + Mem in, + Mem out) { char kernelName[128]; KernelOpt kernelOpt; Kernel kernel; - CHECK_STATUS(set_common_opt(dt, imt, GCL_MEM_BUF, "conv_wino_trans_picbuf_nchw", - kernelName, &kernelOpt)); + CHECK_STATUS( + set_common_opt(dt, imt, GCL_MEM_BUF, "conv_wino_trans_picbuf_nchw", kernelName, &kernelOpt)); U32 gs[3] = {wino_w, wino_h, ic}; U32 ls[3] = {0}; U32 dim = 3; CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel, &kernelOpt)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, iw_str, ih_str, i_off, pw_str, pwh_str, - gs[0], gs[1], in, out)); + CHECK_STATUS( + gcl_set_kernelArgs(kernel, iw_str, ih_str, i_off, pw_str, pwh_str, gs[0], gs[1], in, out)); gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName); #ifdef _DEBUG CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); @@ -129,15 +153,25 @@ inline EE 
wino_trans_pic_img(GCLHandle_t handle, TensorDesc picTranDesc, Mem pic return SUCCESS; } -inline EE wino_gemm(GCLHandle_t handle, DataType dt, - U32 M, U32 N, U32 K, U32 item_m, U32 item_n, U32 wino_num, - GCLMemType ma, GCLMemType mb, Mem A, Mem B, Mem C) +inline EE wino_gemm(GCLHandle_t handle, + DataType dt, + U32 M, + U32 N, + U32 K, + U32 item_m, + U32 item_n, + U32 wino_num, + GCLMemType ma, + GCLMemType mb, + Mem A, + Mem B, + Mem C) { char kernelName[128]; KernelOpt kernelOpt; Kernel kernel; - CHECK_STATUS(set_gemm_tn_opt_mali(item_m, item_n, NO_BIAS, false, ACTIVATION_NULL, dt, - ma, mb, GCL_MEM_BUF, kernelName, &kernelOpt)); + CHECK_STATUS(set_gemm_tn_opt_mali(item_m, item_n, NO_BIAS, false, ACTIVATION_NULL, dt, ma, mb, + GCL_MEM_BUF, kernelName, &kernelOpt)); U32 gs[3] = {N / item_n, M / item_m, wino_num * wino_num}; U32 ls[3] = {0}; U32 dim = 3; @@ -149,8 +183,8 @@ inline EE wino_gemm(GCLHandle_t handle, DataType dt, U32 ch = M; U32 cc = wino_num * wino_num; CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel, &kernelOpt)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, M, N, K, A_str, B_str, C_str, 0, 0, 0, - cw_str, cw, ch, cc, gs[0], gs[1], A, B, C, C)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, M, N, K, A_str, B_str, C_str, 0, 0, 0, cw_str, cw, ch, + cc, gs[0], gs[1], A, B, C, C)); gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName); #ifdef _DEBUG CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); @@ -184,8 +218,8 @@ inline EE wino_trans_out(GCLHandle_t handle, if ((oh & 3) == 0 && (ow & 3) == 0) { useAlign = true; } - CHECK_STATUS(set_conv_wino_trans_outbuf_opt(useAlign, activationMode, DT_F16, GCL_MEM_BUF, - omt, kernelName, &kernelOpt)); + CHECK_STATUS(set_conv_wino_trans_outbuf_opt( + useAlign, activationMode, DT_F16, GCL_MEM_BUF, omt, kernelName, &kernelOpt)); U32 gs[3] = {wino_w, wino_h, (oc + 3) / 4}; U32 ls[3] = {0, 0, 0}; U32 dim = 3; @@ -260,8 +294,8 @@ EE convolution_wino_transform_filter_mali_fp16(GCLHandle_t handle, U32 offset = ALIGN(fn_align * fwhc * bytesOf(fdt), BUFFER_ALIGN_BASE); CHECK_STATUS(gcl_create_sub_buffer(bytes, &offset, tmp, &fltTranMem)); } - CHECK_STATUS(set_common_opt(DT_F16, GCL_MEM_BUF, GCL_MEM_BUF, "conv_wino_trans_fltbuf_3x3", - kernelName, &kernelOpt)); + CHECK_STATUS(set_common_opt( + DT_F16, GCL_MEM_BUF, GCL_MEM_BUF, "conv_wino_trans_fltbuf_3x3", kernelName, &kernelOpt)); CHECK_STATUS(gcl_get_kernel_from_map(handle, kernelName, &kernel, &kernelOpt)); CHECK_STATUS(gcl_set_kernelArgs(kernel, fn_align, fc, fnc, tmp->mem, fltTranMem)); gs[0] = fn_align; @@ -304,7 +338,7 @@ EE convolution_wino_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, if (useImg) { U32 width = (inputNchwDesc.dims[0] + 3) / 4; U32 height = inputNchwDesc.dims[1]; - U32 depth = inputNchwDesc.dims[2] * inputNchwDesc.dims[3]; + U32 depth = inputNchwDesc.dims[2] * inputNchwDesc.dims[3]; if (CHECK_MEET_IMAGE_LIMITS(width, height, depth)) { bytes[1] = width; bytes[2] = height; @@ -316,7 +350,7 @@ EE convolution_wino_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, if (!useImg) { bufSize += ALIGN(tensorNumBytes(inputNchwDesc), BUFFER_ALIGN_BASE); } - } else {//for input is NCHW and memType is image + } else { //for input is NCHW and memType is image bufSize += ALIGN(tensorNumBytes(inputNchwDesc), BUFFER_ALIGN_BASE); } @@ -383,17 +417,17 @@ EE convolution_wino_mali_fp16(GCLHandle_t handle, U32 ow, oh, oc, on; fw = convParamSpec.kernel_w; fh = convParamSpec.kernel_h; - pw = convParamSpec.padding_left; - ph = convParamSpec.padding_top; 
+ pw = convParamSpec.pad_left; + ph = convParamSpec.pad_top; tensorSelectGet(inputDesc, &idt, NULL, NULL, &ic, &ih, &iw); tensorSelectGet(outputDesc, NULL, NULL, &on, &oc, &oh, &ow); fc = ic; fn = oc; Mem inMem = input->mem; - U32 iw_str ,ih_str; + U32 iw_str, ih_str; I32 iw_off, ih_off, i_off; - get_gclmem_dim(input->desc, &iw_str, &ih_str, NULL, (U32*)&iw_off, (U32*)&ih_off); - U32 ow_str ,oh_str, ow_off, oh_off; + get_gclmem_dim(input->desc, &iw_str, &ih_str, NULL, (U32 *)&iw_off, (U32 *)&ih_off); + U32 ow_str, oh_str, ow_off, oh_off; get_gclmem_dim(output->desc, &ow_str, &oh_str, NULL, &ow_off, &oh_off); GCLMemType imt = input->desc.memType; @@ -408,7 +442,7 @@ EE convolution_wino_mali_fp16(GCLHandle_t handle, Mem inputPre; GCLMemType omt; bool useImg = (tmp[1]) ? true : false; - if (inputDesc.df == DF_NCHW) {//for padding input(must be image), have to set data to buffer + if (inputDesc.df == DF_NCHW) { //for padding input(must be image), have to set data to buffer useImg = false; } if (useImg) { @@ -422,9 +456,8 @@ EE convolution_wino_mali_fp16(GCLHandle_t handle, U32 tw_str = desc.dims[0]; U32 th_str = desc.dims[1]; i_off = ih_off * iw_str + iw_off; - CHECK_STATUS(wino_preprocess_input(handle, desc.dt, input->desc.df, - iw_str, ih_str, i_off, tw_str, th_str, - iw, ih, ic, pw, ph, imt, omt, inMem, inputPre)); + CHECK_STATUS(wino_preprocess_input(handle, desc.dt, input->desc.df, iw_str, ih_str, i_off, + tw_str, th_str, iw, ih, ic, pw, ph, imt, omt, inMem, inputPre)); inMem = inputPre; iw_str = tw_str; ih_str = th_str; @@ -441,8 +474,8 @@ EE convolution_wino_mali_fp16(GCLHandle_t handle, CHECK_STATUS(gcl_create_sub_buffer(picTranSize, &offset, tmp[0], &picTran)); U32 pw_str = picTranDesc.dims[0]; U32 pwh_str = pw_str * picTranDesc.dims[1]; - CHECK_STATUS(wino_trans_pic_nchw(handle, picTranDesc.dt, wino_w, wino_h, ic, - iw_str, ih_str, i_off, pw_str, pwh_str, imt, inMem, picTran)); + CHECK_STATUS(wino_trans_pic_nchw(handle, picTranDesc.dt, wino_w, wino_h, ic, iw_str, ih_str, + i_off, pw_str, pwh_str, imt, inMem, picTran)); if (tmp[2]) { CHECK_STATUS(wino_trans_pic_img(handle, picTranDesc, picTran, tmp[2]->mem)); picTran = tmp[2]->mem; @@ -459,12 +492,12 @@ EE convolution_wino_mali_fp16(GCLHandle_t handle, GCLMemType fltTranType = filter->desc.memType; CHECK_STATUS(gcl_create_sub_buffer(gemmOutSize, &offset, tmp[0], &gemmOut)); - CHECK_STATUS(wino_gemm(handle, idt, M, N, K, item_m, item_n, wino_num, - fltTranType, picTranType, fltTran, picTran, gemmOut)); + CHECK_STATUS(wino_gemm(handle, idt, M, N, K, item_m, item_n, wino_num, fltTranType, picTranType, + fltTran, picTran, gemmOut)); Mem biasbuf = bias->mem; Mem outbuf = output->mem; - CHECK_STATUS(wino_trans_out(handle, wino_w, wino_h, N, N * M, ow_str, oh_str, ow_off, oh_off, ow, - oh, oc, output->desc.memType, activationMode, biasbuf, gemmOut, outbuf)); + CHECK_STATUS(wino_trans_out(handle, wino_w, wino_h, N, N * M, ow_str, oh_str, ow_off, oh_off, + ow, oh, oc, output->desc.memType, activationMode, biasbuf, gemmOut, outbuf)); return SUCCESS; } diff --git a/compute/tensor/src/gpu/mali/fp16/deconvolution_gemm_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/deconvolution_gemm_mali_fp16.cpp index d477b7e2..e6a6ed68 100644 --- a/compute/tensor/src/gpu/mali/fp16/deconvolution_gemm_mali_fp16.cpp +++ b/compute/tensor/src/gpu/mali/fp16/deconvolution_gemm_mali_fp16.cpp @@ -42,8 +42,8 @@ inline EE deconv_gemm_core_mali_fp16(GCLHandle_t handle, U32 ow, oh, oc, on; sw = convParamSpec.stride_w; sh = convParamSpec.stride_h; - ph = 
convParamSpec.padding_top; - pw = convParamSpec.padding_left; + ph = convParamSpec.pad_top; + pw = convParamSpec.pad_left; fw = convParamSpec.kernel_w; fh = convParamSpec.kernel_h; tensorSelectGet(inputDesc, NULL, NULL, NULL, &ic, &ih, &iw); diff --git a/compute/tensor/src/gpu/mali/fp16/depth2space_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/depth2space_mali_fp16.cpp index 8cb35531..b52b4af6 100644 --- a/compute/tensor/src/gpu/mali/fp16/depth2space_mali_fp16.cpp +++ b/compute/tensor/src/gpu/mali/fp16/depth2space_mali_fp16.cpp @@ -56,7 +56,7 @@ inline EE depth2space_core_mali_fp16(GCLHandle_t handle, char kernelName[128]; KernelOpt kernelOpt; - if (imf == DF_NCHWC4 && p.blockSize == 2) { + if (imf == DF_NCHWC4 && p.block_size == 2) { U32 gs[3] = {iw, ih, (ic_str + 3) / 4}; U32 ls[3] = {0, 0, 0}; U32 dim = 3; @@ -64,8 +64,8 @@ inline EE depth2space_core_mali_fp16(GCLHandle_t handle, CHECK_STATUS(set_depth2space_nchwc4_2x2_opt( useOutputNchw, DT_F16, input->desc.memType, GCL_MEM_BUF, kernelName, &kernelOpt)); CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel, &kernelOpt)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, p.blockSize, iw_str, ihw_str, ic_str, i_off, ow_str, - oh_str, ohw_str, o_off, iw, ih, oc, inbuf, outbuf)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, p.block_size, iw_str, ihw_str, ic_str, i_off, + ow_str, oh_str, ohw_str, o_off, iw, ih, oc, inbuf, outbuf)); gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName); #ifdef _DEBUG CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); @@ -85,13 +85,13 @@ inline EE depth2space_core_mali_fp16(GCLHandle_t handle, inbuf = tmp; } U32 gs[3] = { - iw, ih, (ic / (p.blockSize * p.blockSize) + 3) / 4 * (p.blockSize * p.blockSize)}; + iw, ih, (ic / (p.block_size * p.block_size) + 3) / 4 * (p.block_size * p.block_size)}; U32 ls[3] = {0, 0, 0}; U32 dim = 3; CHECK_STATUS(set_common_opt( DT_F16, GCL_MEM_BUF, GCL_MEM_BUF, "depth2space_nchw", kernelName, &kernelOpt)); CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel, &kernelOpt)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, p.blockSize, iw_str, ihw_str, ow_str, ohw_str, + CHECK_STATUS(gcl_set_kernelArgs(kernel, p.block_size, iw_str, ihw_str, ow_str, ohw_str, i_off, o_off, iw, ih, ic, inbuf, outbuf)); gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName); #ifdef _DEBUG @@ -110,7 +110,7 @@ EE depth2space_infer_tmpBuf_size_mali_fp16( U32 iw, ih, ic, in; tensorSelectGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw); *bytes = 0; - if (p.blockSize != 2) { + if (p.block_size != 2) { *bytes = in * ic * ih * iw * bytesOf(idt); } return SUCCESS; diff --git a/compute/tensor/src/gpu/mali/fp16/depthwise_convolution_direct_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/depthwise_convolution_direct_mali_fp16.cpp index 0ff1d6ec..7cf8fcc6 100644 --- a/compute/tensor/src/gpu/mali/fp16/depthwise_convolution_direct_mali_fp16.cpp +++ b/compute/tensor/src/gpu/mali/fp16/depthwise_convolution_direct_mali_fp16.cpp @@ -45,8 +45,8 @@ inline EE depthwise_core_mali_fp16(GCLHandle_t handle, U32 ow, oh, oc, on; sw = convParamSpec.stride_w; sh = convParamSpec.stride_h; - pw = convParamSpec.padding_left; - ph = convParamSpec.padding_top; + pw = convParamSpec.pad_left; + ph = convParamSpec.pad_top; dw = convParamSpec.dilatedRate_w; dh = convParamSpec.dilatedRate_h; fw = convParamSpec.kernel_w; diff --git a/compute/tensor/src/gpu/mali/fp16/depthwise_convolution_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/depthwise_convolution_mali_fp16.h index fd018781..ad7c3f9c 100644 --- 
a/compute/tensor/src/gpu/mali/fp16/depthwise_convolution_mali_fp16.h +++ b/compute/tensor/src/gpu/mali/fp16/depthwise_convolution_mali_fp16.h @@ -29,12 +29,12 @@ inline void calDepthwisePaddingVal(TensorDesc inputDesc, U32 dh = convParamSpec.dilatedRate_h; U32 fhd = (fh - 1) * dh + 1; U32 ih = inputDesc.dims[1]; - U32 plv = convParamSpec.padding_left; - U32 prv = convParamSpec.padding_right; - U32 ptv = convParamSpec.padding_top; + U32 plv = convParamSpec.pad_left; + U32 prv = convParamSpec.pad_right; + U32 ptv = convParamSpec.pad_top; U32 pbv = edge_align * sh + (fhd / 2) * 2 - ptv - ih; - if (pbv < convParamSpec.padding_bottom) { - pbv = convParamSpec.padding_bottom; + if (pbv < convParamSpec.pad_bottom) { + pbv = convParamSpec.pad_bottom; } *pl = plv; *pr = prv; diff --git a/compute/tensor/src/gpu/mali/fp16/depthwise_pointwise_convolution_direct_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/depthwise_pointwise_convolution_direct_mali_fp16.cpp index d3334951..1d8c639a 100644 --- a/compute/tensor/src/gpu/mali/fp16/depthwise_pointwise_convolution_direct_mali_fp16.cpp +++ b/compute/tensor/src/gpu/mali/fp16/depthwise_pointwise_convolution_direct_mali_fp16.cpp @@ -48,8 +48,8 @@ inline EE depthwise_pointwise_direct_core_mali_fp16(GCLHandle_t handle, U32 ow, oh, oc, on; sw = convParamSpec.stride_w; sh = convParamSpec.stride_h; - ph = convParamSpec.padding_top; - pw = convParamSpec.padding_left; + ph = convParamSpec.pad_top; + pw = convParamSpec.pad_left; dw = convParamSpec.dilatedRate_w; dh = convParamSpec.dilatedRate_h; fw = convParamSpec.kernel_w; diff --git a/compute/tensor/src/gpu/mali/fp16/depthwise_pointwise_convolution_gemm_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/depthwise_pointwise_convolution_gemm_mali_fp16.cpp index 5e350d79..64de44f2 100644 --- a/compute/tensor/src/gpu/mali/fp16/depthwise_pointwise_convolution_gemm_mali_fp16.cpp +++ b/compute/tensor/src/gpu/mali/fp16/depthwise_pointwise_convolution_gemm_mali_fp16.cpp @@ -49,8 +49,8 @@ inline EE depthwise_pointwise_gemm_core_mali_fp16(GCLHandle_t handle, U32 ow, oh, oc, on; sw = convParamSpec.stride_w; sh = convParamSpec.stride_h; - ph = convParamSpec.padding_top; - pw = convParamSpec.padding_left; + ph = convParamSpec.pad_top; + pw = convParamSpec.pad_left; dw = convParamSpec.dilatedRate_w; dh = convParamSpec.dilatedRate_h; fw = convParamSpec.kernel_w; diff --git a/compute/tensor/src/gpu/mali/fp16/eltwise_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/eltwise_mali_fp16.cpp index a04ddae0..2bda3f91 100644 --- a/compute/tensor/src/gpu/mali/fp16/eltwise_mali_fp16.cpp +++ b/compute/tensor/src/gpu/mali/fp16/eltwise_mali_fp16.cpp @@ -147,7 +147,7 @@ inline EE eltwise_core_mali_fp16(GCLHandle_t handle, KernelOpt kernelOpt; char kernelName[128]; bool useNchwFormat = (inputMem[arrayDimMax]->desc.memFormat == DF_NCHW) ? 
true : false; - EltwiseMode eltwiseMode = eltwiseDesc.elt_mode; + EltwiseMode eltwiseMode = eltwiseDesc.mode; ActivationMode activeMode = eltwiseDesc.activation_type; U32 gs[3] = {iw, ih, (ic + 3) / 4 * in * it}; U32 ls[3] = {0, 0, 0}; diff --git a/compute/tensor/src/gpu/mali/fp16/padding_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/padding_mali_fp16.cpp index 3d9100f6..4fcec723 100644 --- a/compute/tensor/src/gpu/mali/fp16/padding_mali_fp16.cpp +++ b/compute/tensor/src/gpu/mali/fp16/padding_mali_fp16.cpp @@ -27,11 +27,11 @@ inline EE padding_checkpara_mali_fp16(GCLHandle_t handle, if (inputDesc.dt != outputDesc.dt || inputDesc.dt != DT_F16) { return NOT_SUPPORTED; } - if (padParamSpec.pad_mode == Pad_Reflect && + if (padParamSpec.pad_mode == PAD_REFLECT && (padParamSpec.top >= inputDesc.dims[1] || padParamSpec.bottom >= inputDesc.dims[1])) { return NOT_SUPPORTED; } - if (padParamSpec.pad_mode == Pad_Symmetric && + if (padParamSpec.pad_mode == PAD_SYMMETRIC && (padParamSpec.left > inputDesc.dims[0] || padParamSpec.right > inputDesc.dims[0])) { return NOT_SUPPORTED; } diff --git a/compute/tensor/src/gpu/mali/fp16/pooling_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/pooling_mali_fp16.cpp index 48a0fff2..6b47d7c0 100644 --- a/compute/tensor/src/gpu/mali/fp16/pooling_mali_fp16.cpp +++ b/compute/tensor/src/gpu/mali/fp16/pooling_mali_fp16.cpp @@ -33,10 +33,10 @@ inline EE pooling_checkpara_mali_fp16(GCLHandle_t handle, if (inputDesc.dims[2] != outputDesc.dims[2] || inputDesc.dims[3] != outputDesc.dims[3]) { return NOT_SUPPORTED; } - if (poolingParamSpec.padding_top >= poolingParamSpec.kernel_h) { + if (poolingParamSpec.pad_top >= poolingParamSpec.kernel_h) { return NOT_SUPPORTED; } - if (poolingParamSpec.padding_bottom >= poolingParamSpec.kernel_w) { + if (poolingParamSpec.pad_bottom >= poolingParamSpec.kernel_w) { return NOT_SUPPORTED; } if (input->desc.memFormat != output->desc.memFormat || input->desc.memFormat != DF_NCHWC4) { @@ -74,9 +74,9 @@ inline EE pooling_core_mali_fp16(GCLHandle_t handle, sw = poolingParamSpec.stride_w; sh = poolingParamSpec.stride_h; st = poolingParamSpec.stride_t; - pw = poolingParamSpec.padding_left; - ph = poolingParamSpec.padding_top; - pt = poolingParamSpec.padding_before; + pw = poolingParamSpec.pad_left; + ph = poolingParamSpec.pad_top; + pt = poolingParamSpec.pad_before; kw = poolingParamSpec.kernel_w; kh = poolingParamSpec.kernel_h; kt = poolingParamSpec.kernel_t; @@ -134,7 +134,8 @@ inline EE pooling_core_mali_fp16(GCLHandle_t handle, mode, DT_F16, input->desc.memType, output->desc.memType, kernelName, &kernelOpt)); CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel, &kernelOpt)); CHECK_STATUS(gcl_set_kernelArgs(kernel, iw_str, ih_str, iw_off, ih_off, ow_str, oh_str, - o_off, iw, ih, ow, oh, sw, sh, pw, ph, kw, kh, inbuf, outbuf)); + o_off, iw, ih, ow, oh, sw, sh, pw, ph, kw, kh, (int)poolingParamSpec.count_include_pad, + inbuf, outbuf)); CHECK_STATUS(gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName)); #ifdef _DEBUG CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); diff --git a/compute/tensor/src/gpu/mali/fp16/reduction_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/reduction_mali_fp16.cpp index 77268dee..a8190758 100644 --- a/compute/tensor/src/gpu/mali/fp16/reduction_mali_fp16.cpp +++ b/compute/tensor/src/gpu/mali/fp16/reduction_mali_fp16.cpp @@ -36,7 +36,7 @@ inline EE reduction_core_mali_fp16(GCLHandle_t handle, { int axisTran[6]; int axis; - for (int i = 0; i < p.axes_num; i++) { + for (int i = 0; i < 
p.num_axes; i++) { axis = p.axes[i]; if (axis < 0) { axis = inputDesc.nDims + axis; @@ -97,8 +97,8 @@ inline EE reduction_core_mali_fp16(GCLHandle_t handle, useNchw = true; edge = ow; } - CHECK_STATUS(set_reduction_opt_mali(useNchw, useOc4, axis, p.reduction_mode, DT_F16, - GCL_MEM_BUF, GCL_MEM_BUF, kernelName, &kernelOpt)); + CHECK_STATUS(set_reduction_opt_mali( + useNchw, useOc4, axis, p.mode, DT_F16, GCL_MEM_BUF, GCL_MEM_BUF, kernelName, &kernelOpt)); CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel, &kernelOpt)); CHECK_STATUS(gcl_set_kernelArgs(kernel, iw_str, ih_str, ow_str, oh_str, i_off, o_off, iw, ih, ic, edge, keep_dim, od, gs[0], gs[1], inbuf, outbuf)); diff --git a/compute/tensor/src/gpu/mali/fp16/rnn_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/rnn_mali_fp16.cpp index 9779aecc..94a1cecf 100644 --- a/compute/tensor/src/gpu/mali/fp16/rnn_mali_fp16.cpp +++ b/compute/tensor/src/gpu/mali/fp16/rnn_mali_fp16.cpp @@ -232,9 +232,9 @@ inline EE rnn_core_update(GCLHandle_t handle, char *kernelName, KernelOpt *kernelOpt) { - float fbias = rnnPara.forgetBias; - float zonecell = rnnPara.zoneoutCell; - float zoneout = rnnPara.zoneoutOutput; + float fbias = rnnPara.forget_bias; + float zonecell = rnnPara.zoneout_cell; + float zoneout = rnnPara.zoneout_output; U32 gs = (col + 3) / 4; U32 ls = 16; U32 dim = 1; @@ -310,7 +310,7 @@ inline EE rnn_core_mali_fp16(GCLHandle_t handle, GCLMem_t output, ForwardRunInfoMali_t forwardRunInfo) { - bool project = (rnnPara.numProjection > 0) ? true : false; + bool project = (rnnPara.num_projection > 0) ? true : false; if (project) { CHECK_STATUS(NOT_SUPPORTED); } @@ -337,8 +337,8 @@ inline EE rnn_core_mali_fp16(GCLHandle_t handle, U32 batch = desc.dims[desc.nDims - 1]; U32 step = desc.dims[desc.nDims - 2]; U32 xDim = desc.dims[desc.nDims - 3]; - U32 hDim = rnnPara.numOutput; - U32 col = (rnnPara.numProjection > 0) ? rnnPara.numProjection : hDim; + U32 hDim = rnnPara.num_outputs; + U32 col = (rnnPara.num_projection > 0) ? rnnPara.num_projection : hDim; for (U32 i = 0; i < desc.nDims - 3; i++) { xDim *= desc.dims[i]; } @@ -421,7 +421,7 @@ inline EE rnn_core_mali_fp16(GCLHandle_t handle, rnn_core_copy_stateH(handle, col, hDim, outputDescs.size(), false, stateH, output)); } - if (rnnPara.biDirection) { + if (rnnPara.bi_direction) { gemmMatB = filter[filterCount].mem; gemmMatBType = filter[filterCount].desc.memType; gemmBias = bias[biasCount].mem; @@ -458,7 +458,7 @@ inline void transform_filter_desc(TensorDesc filterDesc, { U32 filterRow, filterCol; tensorSelectGet(filterDesc, NULL, NULL, NULL, NULL, &filterRow, &filterCol); - U32 hDim = rnnPara.numOutput; + U32 hDim = rnnPara.num_outputs; U32 xDim = filterCol - hDim; TensorDesc desc; @@ -499,7 +499,7 @@ EE rnn_transform_filter_mali_fp16(GCLHandle_t handle, DataType fdt; U32 filterRow, filterCol; tensorSelectGet(filterDesc, &fdt, NULL, NULL, NULL, &filterRow, &filterCol); - U32 hDim = rnnPara.numOutput; + U32 hDim = rnnPara.num_outputs; U32 xDim = filterCol - hDim; char kernelName[128]; KernelOpt kernelOpt; @@ -515,7 +515,7 @@ EE rnn_transform_filter_mali_fp16(GCLHandle_t handle, CHECK_STATUS(gcl_create_sub_buffer(weightGemmSize, &subMemOff, tmpBuf, &weightGemm)); CHECK_STATUS(gcl_create_sub_buffer(weightGemvSize, &subMemOff, tmpBuf, &weightGemv)); - U32 biDirNum = (rnnPara.biDirection) ? 2 : 1; + U32 biDirNum = (rnnPara.bi_direction) ? 
2 : 1; U32 filterCount = 0; U32 filterTranCount = 0; U32 item_n = forwardRunInfo->best_h[0]; @@ -596,8 +596,8 @@ EE rnn_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, size += ALIGN(gemmMatASize, BUFFER_ALIGN_BASE); } - U32 hDim = rnnPara.numOutput; - U32 col = (rnnPara.numProjection > 0) ? rnnPara.numProjection : hDim; + U32 hDim = rnnPara.num_outputs; + U32 col = (rnnPara.num_projection > 0) ? rnnPara.num_projection : hDim; U32 filterRow = col * 4; U32 M = ALIGN(step * batch, item_m); U32 N = ALIGN(filterRow, item_n); diff --git a/compute/tensor/src/gpu/mali/fp16/rnncell_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/rnncell_mali_fp16.cpp index 87f2a536..83b4dc7e 100644 --- a/compute/tensor/src/gpu/mali/fp16/rnncell_mali_fp16.cpp +++ b/compute/tensor/src/gpu/mali/fp16/rnncell_mali_fp16.cpp @@ -44,12 +44,12 @@ inline EE rnncell_core_mali_fp16(GCLHandle_t handle, ForwardRunInfoMali_t forwardRunInfo) { U32 item_c = forwardRunInfo->best_c[0]; - U32 hDim = rnncellDesc.numOutput; - U32 col = (rnncellDesc.numProjection > 0) ? rnncellDesc.numProjection : hDim; - bool project = (rnncellDesc.numProjection > 0) ? true : false; - float fbias = rnncellDesc.forgetBias; - float zonecell = rnncellDesc.zoneoutCell; - float zoneout = rnncellDesc.zoneoutOutput; + U32 hDim = rnncellDesc.num_outputs; + U32 col = (rnncellDesc.num_projection > 0) ? rnncellDesc.num_projection : hDim; + bool project = (rnncellDesc.num_projection > 0) ? true : false; + float fbias = rnncellDesc.forget_bias; + float zonecell = rnncellDesc.zoneout_cell; + float zoneout = rnncellDesc.zoneout_output; U32 xw_str, xh_str, xh_off, xw_off; U32 hw_str, hh_str, hh_off, hw_off; CHECK_STATUS(gclmem_get_desc_padding(currentX->desc, &xw_str, &xh_str, NULL, &xw_off, &xh_off)); @@ -123,7 +123,7 @@ inline EE rnncell_core_mali_fp16(GCLHandle_t handle, if (project) { item_c = forwardRunInfo->best_c[1]; - filterRow = rnncellDesc.numOutput; + filterRow = rnncellDesc.num_outputs; fltbuf = filter[1].mem; tmpOff = offset; //biasMem = bias[1].mem; @@ -150,13 +150,13 @@ inline void transform_filter_desc(TensorDesc filterDesc, U32 item_c = forwardRunInfo->best_c[0]; U32 item_k = forwardRunInfo->best_k[0]; ftmDesc[0] = gemv_transform_filter_desc(filterDesc, item_h, item_c, item_k); - bool useProject = (rnnParamSpec.numProjection > 0) ? true : false; + bool useProject = (rnnParamSpec.num_projection > 0) ? true : false; if (useProject) { item_h = forwardRunInfo->best_h[1]; item_c = forwardRunInfo->best_c[1]; item_k = forwardRunInfo->best_k[1]; - TensorDesc filterDescPro = - tensor2df(filterDesc.dt, DF_NORMAL, rnnParamSpec.numOutput, rnnParamSpec.numProjection); + TensorDesc filterDescPro = tensor2df( + filterDesc.dt, DF_NORMAL, rnnParamSpec.num_outputs, rnnParamSpec.num_projection); ftmDesc[1] = gemv_transform_filter_desc(filterDescPro, item_h, item_c, item_k); } } @@ -178,15 +178,15 @@ EE rnncell_transform_filter_mali_fp16(GCLHandle_t handle, GCLMem_t fltmem, ForwardRunInfoMali_t forwardRunInfo) { - U32 filterNum = (rnnParamSpec.numProjection > 0) ? 2 : 1; + U32 filterNum = (rnnParamSpec.num_projection > 0) ? 
2 : 1; for (U32 i = 0; i < filterNum; i++) { ForwardRunInfoMali runInfo = *forwardRunInfo; if (i == 1) { runInfo.best_h[i - 1] = runInfo.best_h[i]; runInfo.best_c[i - 1] = runInfo.best_c[i]; runInfo.best_k[i - 1] = runInfo.best_k[i]; - filterDesc.dims[0] = rnnParamSpec.numProjection; - filterDesc.dims[1] = rnnParamSpec.numOutput; + filterDesc.dims[0] = rnnParamSpec.num_projection; + filterDesc.dims[1] = rnnParamSpec.num_outputs; } CHECK_STATUS(gemv_transform_filter_mali_fp16( handle, filterDesc, &filter[i], &fltmemDesc[i], &fltmem[i], &runInfo)); @@ -204,12 +204,12 @@ EE rnncell_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, U32 item_c = forwardRunInfo->best_c[0]; DataType dt = inputDesc.dt; U32 xDim = inputDesc.dims[0]; - U32 hDim = rnncellDesc.numOutput; + U32 hDim = rnncellDesc.num_outputs; U32 c_align = (item_c > 16) ? (item_c >> 4) : item_c; U32 xhNum = ALIGN(xDim + hDim, c_align); U32 xhSize = ALIGN(xhNum * bytesOf(dt), BUFFER_ALIGN_BASE); - U32 col = (rnncellDesc.numProjection > 0) ? rnncellDesc.numProjection : hDim; + U32 col = (rnncellDesc.num_projection > 0) ? rnncellDesc.num_projection : hDim; U32 filterRow = col * 4; U32 interNum = filterRow + 4; U32 interSize = ALIGN(interNum * bytesOf(dt), BUFFER_ALIGN_BASE); @@ -217,12 +217,12 @@ EE rnncell_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, U32 tmpOutSize = 0; U32 filterRowPro = 0; U32 item_cp = item_c; - if (rnncellDesc.numProjection > 0) { + if (rnncellDesc.num_projection > 0) { item_cp = forwardRunInfo->best_c[1]; U32 cp_align = (item_cp > 16) ? (item_cp >> 4) : item_cp; U32 tmpOutNum = ALIGN(col, cp_align); tmpOutSize = ALIGN(tmpOutNum * bytesOf(dt), BUFFER_ALIGN_BASE); - filterRowPro = rnncellDesc.numOutput; + filterRowPro = rnncellDesc.num_outputs; } U32 reduceSize = 0; diff --git a/compute/tensor/src/gpu/mali/fp16/tensor_computing_fp16.h b/compute/tensor/src/gpu/mali/fp16/tensor_computing_fp16.h index 05c8ba25..5947e040 100644 --- a/compute/tensor/src/gpu/mali/fp16/tensor_computing_fp16.h +++ b/compute/tensor/src/gpu/mali/fp16/tensor_computing_fp16.h @@ -24,4 +24,97 @@ (gcl_check_meet_device_image3d_limits( \ OCLContext::getInstance().handle.get(), width, height, depth)) +inline std::vector build_conv_forward_algorithm_flag(TensorDesc inputDesc, + std::vector filterDesc, + OperatorType opType, + GCLMemType imt, + GCLMemType omt, + ConvolutionParamSpec convParamSpec) +{ + std::vector flag; + flag.push_back(opType); + flag.push_back(convParamSpec.convolution_type); + for (U32 i = 0; i < inputDesc.nDims; i++) { + flag.push_back(inputDesc.dims[i]); + } + for (auto &p : filterDesc) { + for (U32 i = 0; i < p.nDims; i++) { + flag.push_back(p.dims[i]); + } + } + flag.push_back(convParamSpec.kernel_t); + flag.push_back(convParamSpec.kernel_h); + flag.push_back(convParamSpec.kernel_w); + flag.push_back(convParamSpec.stride_t); + flag.push_back(convParamSpec.stride_h); + flag.push_back(convParamSpec.stride_w); + flag.push_back(convParamSpec.group); + flag.push_back(convParamSpec.dilatedRate_t); + flag.push_back(convParamSpec.dilatedRate_h); + flag.push_back(convParamSpec.dilatedRate_w); + flag.push_back(imt); + flag.push_back(omt); + return flag; +} + +inline std::vector build_fully_connected_forward_algorithm_flag( + TensorDesc inputDesc, TensorDesc filterDesc, GCLMemType imt, GCLMemType omt) +{ + std::vector flag; + flag.push_back(OT_FC); + for (U32 i = 0; i < inputDesc.nDims; i++) { + flag.push_back(inputDesc.dims[i]); + } + for (U32 i = 0; i < filterDesc.nDims; i++) { + flag.push_back(filterDesc.dims[i]); + } + 
flag.push_back(imt); + flag.push_back(omt); + return flag; +} + +inline std::vector build_matmul_forward_algorithm_flag(TensorDesc matrixADesc, + bool transposeA, + TensorDesc matrixBDesc, + bool transposeB, + GCLMemType amt, + GCLMemType bmt, + GCLMemType cmt) +{ + std::vector flag; + flag.push_back(OT_MatMul); + flag.push_back(transposeA); + flag.push_back(transposeB); + for (U32 i = 0; i < matrixADesc.nDims; i++) { + flag.push_back(matrixADesc.dims[i]); + } + for (U32 i = 0; i < matrixBDesc.nDims; i++) { + flag.push_back(matrixBDesc.dims[i]); + } + flag.push_back(amt); + flag.push_back(bmt); + flag.push_back(cmt); + return flag; +} + +inline std::vector build_rnn_forward_algorithm_flag( + TensorDesc inputDesc, std::vector filterDesc, RNNParamSpec rnnPara) +{ + std::vector flag; + flag.push_back(OT_RNN); + flag.push_back(rnnPara.steps); + flag.push_back(rnnPara.mode); + flag.push_back(rnnPara.num_outputs); + flag.push_back(rnnPara.num_projection); + flag.push_back(rnnPara.bi_direction); + for (U32 i = 0; i < inputDesc.nDims; i++) { + flag.push_back(inputDesc.dims[i]); + } + for (auto &p : filterDesc) { + for (U32 i = 0; i < p.nDims; i++) { + flag.push_back(p.dims[i]); + } + } + return flag; +} #endif diff --git a/compute/tensor/src/gpu/mali/fp16/tfslice_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/tfslice_mali_fp16.cpp index cec878b9..7b57577d 100644 --- a/compute/tensor/src/gpu/mali/fp16/tfslice_mali_fp16.cpp +++ b/compute/tensor/src/gpu/mali/fp16/tfslice_mali_fp16.cpp @@ -56,7 +56,6 @@ inline EE tfslice_core_mali_fp16(GCLHandle_t handle, DataFormat imf = input->desc.memFormat; DataFormat omf = output->desc.memFormat; - char kernelName[128]; Kernel kernel; U32 gs[3] = {0, 0, 0}; U32 ls[3] = {0, 0, 0}; @@ -87,7 +86,7 @@ inline EE tfslice_core_mali_fp16(GCLHandle_t handle, gs[0] = ow; gs[1] = oh; gs[2] = oc * on; - sprintf(kernelName, "tfslice_nchw"); + const char *kernelName = "tfslice_nchw"; CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); CHECK_STATUS(gcl_set_kernelArgs(kernel, iw_str, ih_str, ow_str, oh_str, i_off, o_off, ic, oc, be[0], be[1], be[2], be[3], stride[0], stride[1], stride[2], stride[3], gs[0], gs[1], inMem, diff --git a/compute/tensor/src/gpu/mali/fp16/topk_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/topk_mali_fp16.cpp index 7bf03f6a..45ee9cad 100644 --- a/compute/tensor/src/gpu/mali/fp16/topk_mali_fp16.cpp +++ b/compute/tensor/src/gpu/mali/fp16/topk_mali_fp16.cpp @@ -43,15 +43,16 @@ inline EE topk_core_mali_fp16(GCLHandle_t handle, axis = inputDesc.nDims - 1 - axis; U32 len = inputDesc.dims[axis]; I32 sorted = p.sorted; - I32 top_k = p.topk; + I32 top_k = p.k; I32 largest = p.largest; - char modeName[128]; + std::string modeName; if (largest) { - strcpy(modeName, "max"); + modeName = "max"; } else { - strcpy(modeName, "min"); + modeName = "min"; } if (sorted) { + UNI_ERROR_LOG("GPU have not support topK sorted"); CHECK_STATUS(NOT_SUPPORTED); } Mem outputId = outputIndices->mem; @@ -89,17 +90,16 @@ inline EE topk_core_mali_fp16(GCLHandle_t handle, CHECK_STATUS(gcl_create_sub_buffer(size, &sub_off, tmpbuf, &sub_id[3])); Kernel kernel; - char kernelName[1024]; - sprintf(kernelName, "topk_sort_%s", modeName); - CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); + std::string kernelName = "topk_sort_" + modeName; + CHECK_STATUS(gcl_create_kernel(handle, kernelName.c_str(), &kernel)); U32 gs[3] = {0, 0, 0}; U32 ls[3] = {0, 0, 0}; U32 dim = 1; gs[0] = (len + 15) / 16; CHECK_STATUS(gcl_set_kernelArgs(kernel, len, gs[0], input->mem, sub[0], 
sub_id[0])); - CHECK_STATUS(gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName)); + CHECK_STATUS(gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName.c_str())); #ifdef _DEBUG - CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName.c_str())); #endif U32 top_k_loop = (top_k + 15) / 16; @@ -108,7 +108,7 @@ inline EE topk_core_mali_fp16(GCLHandle_t handle, U32 mem_out_index = 1; U32 out_off = 0; U32 out_val_num = 16; - sprintf(kernelName, "topk_merge_%s", modeName); + kernelName = "topk_merge_" + modeName; Mem merge_in, merge_out, merge_in_id, merge_out_id; gs[0] = (len + 15) / 16; ls[0] = 0; @@ -124,12 +124,12 @@ inline EE topk_core_mali_fp16(GCLHandle_t handle, out_off = i * 16; out_val_num = ((i * 16 + 16) <= (U32)top_k) ? 16 : (top_k % 16); } - CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); + CHECK_STATUS(gcl_create_kernel(handle, kernelName.c_str(), &kernel)); CHECK_STATUS(gcl_set_kernelArgs(kernel, total_group_num, out_val_num, out_off, gs[0], merge_in, merge_in_id, merge_out, merge_out_id)); - CHECK_STATUS(gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName)); + CHECK_STATUS(gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName.c_str())); #ifdef _DEBUG - CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName.c_str())); #endif if (gs[0] > 1) { mem_in_index++; @@ -144,7 +144,7 @@ inline EE topk_core_mali_fp16(GCLHandle_t handle, } if (i < top_k_loop - 1 || need_out_id) { - sprintf(kernelName, "topk_update_%s", modeName); + kernelName = "topk_update_" + modeName; gs[0] = 16; ls[0] = 16; int out_id_off = out_off; @@ -152,12 +152,12 @@ inline EE topk_core_mali_fp16(GCLHandle_t handle, if (!need_out_id) { outputId = sub_id[0]; } - CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); + CHECK_STATUS(gcl_create_kernel(handle, kernelName.c_str(), &kernel)); CHECK_STATUS(gcl_set_kernelArgs(kernel, need_out_id, out_id_off, out_id_num, gs[0], merge_out_id, sub[0], sub_id[0], outputId)); - CHECK_STATUS(gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName)); + CHECK_STATUS(gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName.c_str())); #ifdef _DEBUG - CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName.c_str())); #endif } } diff --git a/compute/tensor/src/gpu/mali/fully_connected.cpp b/compute/tensor/src/gpu/mali/fully_connected.cpp index ab00b100..3ccfae4e 100644 --- a/compute/tensor/src/gpu/mali/fully_connected.cpp +++ b/compute/tensor/src/gpu/mali/fully_connected.cpp @@ -98,6 +98,13 @@ EE fully_connected_infer_forward_algorithm_mali(GCLHandle_t handle, if (algorithm != CONVOLUTION_ALGORITHM_NULL) { return SUCCESS; } + GCLMemType imt = inputMemDesc.memType; + GCLMemType omt = outputMemDesc.memType; + std::vector flag = build_fully_connected_forward_algorithm_flag( + inputDesc, filterDesc, imt, omt); + if (gcl_get_runInfo_from_cache(handle, flag, forwardRunInfo)) { + return SUCCESS; + } DataType dt = inputDesc.dt; U32 fc = filterDesc.dims[0]; U32 fn = filterDesc.dims[1]; @@ -229,6 +236,7 @@ EE fully_connected_infer_forward_algorithm_mali(GCLHandle_t handle, CHECK_STATUS(NOT_SUPPORTED); } *forwardRunInfo = bestRunInfo; + gcl_set_runInfo_to_cache(handle, flag, bestRunInfo); CHECK_STATUS(gcl_finish(handle)); gcl_destroy_gclmem(input); gcl_destroy_gclmem(tmpBuf); diff --git 
a/compute/tensor/src/gpu/mali/matmul.cpp b/compute/tensor/src/gpu/mali/matmul.cpp index 2e4b6199..fbbc8396 100644 --- a/compute/tensor/src/gpu/mali/matmul.cpp +++ b/compute/tensor/src/gpu/mali/matmul.cpp @@ -202,6 +202,14 @@ EE matmul_infer_forward_algorithm_mali(GCLHandle_t handle, if (algorithm != CONVOLUTION_ALGORITHM_NULL) { return SUCCESS; } + GCLMemType amt = gclmemMatrixADesc.memType; + GCLMemType bmt = gclmemMatrixBDesc.memType; + GCLMemType cmt = gclmemMatrixCDesc.memType; + std::vector flag = build_matmul_forward_algorithm_flag( + matrixADesc, transposeA, matrixBDesc, transposeB, amt, bmt, cmt); + if (gcl_get_runInfo_from_cache(handle, flag, forwardRunInfo)) { + return SUCCESS; + } std::vector matmulAlgorithms; std::vector vecH; std::vector vecC; @@ -290,6 +298,7 @@ EE matmul_infer_forward_algorithm_mali(GCLHandle_t handle, CHECK_STATUS(NOT_SUPPORTED); } *forwardRunInfo = bestRunInfo; + gcl_set_runInfo_to_cache(handle, flag, bestRunInfo); CHECK_STATUS(gcl_finish(handle)); gcl_destroy_gclmem(matrixA); gcl_destroy_gclmem(matrixB); diff --git a/compute/tensor/src/gpu/mali/pooling.cpp b/compute/tensor/src/gpu/mali/pooling.cpp index 24e16ea6..4cde1ff5 100644 --- a/compute/tensor/src/gpu/mali/pooling.cpp +++ b/compute/tensor/src/gpu/mali/pooling.cpp @@ -27,12 +27,12 @@ EE pooling_padding_input_mali(TensorDesc inputDesc, if (inputMem == nullptr || outputMem == nullptr || outputDesc == nullptr) { CHECK_STATUS(NULL_POINTER); } - U32 pl = poolingParamSpec.padding_left; - U32 pr = poolingParamSpec.padding_right; - U32 pt = poolingParamSpec.padding_top; - U32 pb = poolingParamSpec.padding_bottom; - U32 pf = poolingParamSpec.padding_before; - U32 pa = poolingParamSpec.padding_after; + U32 pl = poolingParamSpec.pad_left; + U32 pr = poolingParamSpec.pad_right; + U32 pt = poolingParamSpec.pad_top; + U32 pb = poolingParamSpec.pad_bottom; + U32 pf = poolingParamSpec.pad_before; + U32 pa = poolingParamSpec.pad_after; inputMem->padding(pl, pr, pt, pb, pf, pa); return SUCCESS; } diff --git a/compute/tensor/src/gpu/mali/reduction.cpp b/compute/tensor/src/gpu/mali/reduction.cpp index 8dd5a1e6..99f3f85d 100644 --- a/compute/tensor/src/gpu/mali/reduction.cpp +++ b/compute/tensor/src/gpu/mali/reduction.cpp @@ -37,7 +37,7 @@ inline EE reduction_checkpara_mali(GCLHandle_t handle, if (tensorNumElements(maskDesc) != 0) { CHECK_STATUS(NOT_SUPPORTED); //unsupport currently } - if (p.axes_num > 1) { + if (p.num_axes > 1) { CHECK_STATUS(NOT_SUPPORTED); } int axis = p.axes[0]; @@ -64,7 +64,7 @@ EE reduction_padding_input_mali(TensorDesc inputDesc, int axisTran[6]; TensorDesc tmpDesc = inputDesc; - for (int i = 0; i < p.axes_num; i++) { + for (int i = 0; i < p.num_axes; i++) { int axis = p.axes[i]; if (axis < 0) { axis = tmpDesc.nDims + axis; diff --git a/compute/tensor/src/gpu/mali/rnn.cpp b/compute/tensor/src/gpu/mali/rnn.cpp index cb9fc93b..7859e490 100644 --- a/compute/tensor/src/gpu/mali/rnn.cpp +++ b/compute/tensor/src/gpu/mali/rnn.cpp @@ -92,6 +92,10 @@ EE rnn_infer_forward_algorithm_mali(GCLHandle_t handle, if (algorithm != CONVOLUTION_ALGORITHM_NULL) { return SUCCESS; } + std::vector flag = build_rnn_forward_algorithm_flag(inputDesc, filterDescs, rnnPara); + if (gcl_get_runInfo_from_cache(handle, flag, forwardRunInfo)) { + return SUCCESS; + } std::vector rnnAlgorithms; std::vector algoNumIndexGemm; std::vector vecHGemm; @@ -105,7 +109,7 @@ EE rnn_infer_forward_algorithm_mali(GCLHandle_t handle, std::vector vecHGemvPro; std::vector vecCGemvPro; std::vector vecKGemvPro; - bool useProjection = 
(rnnPara.numProjection > 0) ? true : false; + bool useProjection = (rnnPara.num_projection > 0) ? true : false; U32 filterCol = filterDescs[0].dims[0]; U32 filterRow = filterDescs[0].dims[1]; U32 filterColPro = (useProjection) ? filterDescs[1].dims[0] : filterCol; @@ -267,7 +271,7 @@ EE rnn_infer_forward_algorithm_mali(GCLHandle_t handle, outputDescs.push_back(outputDesc); std::vector filters; std::vector biases; - U32 biDirNum = (rnnPara.biDirection) ? 2 : 1; + U32 biDirNum = (rnnPara.bi_direction) ? 2 : 1; for (U32 i = 0; i < biDirNum; i++) { filters.push_back(*filterX); filters.push_back(*filterH); @@ -329,6 +333,7 @@ EE rnn_infer_forward_algorithm_mali(GCLHandle_t handle, CHECK_STATUS(NOT_SUPPORTED); } *forwardRunInfo = bestRunInfo; + gcl_set_runInfo_to_cache(handle, flag, bestRunInfo); CHECK_STATUS(gcl_finish(handle)); gcl_destroy_gclmem(input); gcl_destroy_gclmem(filterX); diff --git a/compute/tensor/src/gpu/mali/rnncell.cpp b/compute/tensor/src/gpu/mali/rnncell.cpp index d2780bae..95c886a2 100644 --- a/compute/tensor/src/gpu/mali/rnncell.cpp +++ b/compute/tensor/src/gpu/mali/rnncell.cpp @@ -32,7 +32,7 @@ inline void rnncell_produce_algos_paras(RNNParamSpec rnnPara, rnncellAlgorithms->push_back(CONVOLUTION_ALGORITHM_GEMM); CHECK_STATUS(get_gemv_cal_scheme(vecH, vecC, vecK)); algoNumIndex->push_back(vecH->size()); - if (rnnPara.numProjection) { + if (rnnPara.num_projection) { CHECK_STATUS(get_gemv_cal_scheme(vecHP, vecCP, vecKP)); algoNumIndexP->push_back(vecHP->size()); } @@ -61,7 +61,7 @@ inline EE rnncell_checkpara_mali(GCLHandle_t handle, if (iB != 1) { CHECK_STATUS(NOT_SUPPORTED); } - U32 hDim = rnnPara.numOutput; + U32 hDim = rnnPara.num_outputs; if (hDesc.dims[0] != hDim && hDesc.dims[1] != hDim) { CHECK_STATUS(NOT_MATCH); } @@ -88,6 +88,11 @@ EE rnncell_infer_forward_algorithm_mali(GCLHandle_t handle, if (algorithm != CONVOLUTION_ALGORITHM_NULL) { return SUCCESS; } + std::vector filterDescVec(1, filterDesc); + std::vector flag = build_rnn_forward_algorithm_flag(xDesc, filterDescVec, rnnPara); + if (gcl_get_runInfo_from_cache(handle, flag, forwardRunInfo)) { + return SUCCESS; + } std::vector rnncellAlgorithms; std::vector algoNumIndex; std::vector vecH; @@ -118,7 +123,7 @@ EE rnncell_infer_forward_algorithm_mali(GCLHandle_t handle, U32 offset[3] = {0, 0, 0}; U32 maxFilterSize[2] = {0, 0}; TensorDesc ftmDesc[2]; - bool useProject = (rnnPara.numProjection > 0) ? true : false; + bool useProject = (rnnPara.num_projection > 0) ? true : false; U32 filterNum = (useProject) ? 2 : 1; ForwardRunInfoMali runInfo; runInfo.algorithm = rnncellAlgorithms[0]; @@ -155,7 +160,7 @@ EE rnncell_infer_forward_algorithm_mali(GCLHandle_t handle, if (algosNum == 0) { CHECK_STATUS(NOT_SUPPORTED); } - U32 col = (useProject) ? rnnPara.numProjection : rnnPara.numOutput; + U32 col = (useProject) ? 
rnnPara.num_projection : rnnPara.num_outputs; stride[0] = col * 4; stride[1] = 1; stride[2] = 1; @@ -176,7 +181,7 @@ EE rnncell_infer_forward_algorithm_mali(GCLHandle_t handle, stride[2] = ftmDesc[1].dims[2]; CHECK_STATUS(gclmem_set_desc_padding( &filter1->desc, stride, offset, dt, DF_NCHW, GCL_MEM_BUF, CL_MEM_READ_WRITE)); - stride[0] = rnnPara.numOutput; + stride[0] = rnnPara.num_outputs; CHECK_STATUS(gclmem_set_desc_padding( &bias1->desc, stride, offset, dt, DF_NHWC, GCL_MEM_BUF, CL_MEM_READ_WRITE)); gcl_create_memory(handle, filter1); @@ -240,6 +245,7 @@ EE rnncell_infer_forward_algorithm_mali(GCLHandle_t handle, CHECK_STATUS(NOT_SUPPORTED); } *forwardRunInfo = bestRunInfo; + gcl_set_runInfo_to_cache(handle, flag, bestRunInfo); CHECK_STATUS(gcl_finish(handle)); gcl_destroy_gclmem(currentX); gcl_destroy_gclmem(state); diff --git a/compute/tensor/src/gpu/mali/roialign.cpp b/compute/tensor/src/gpu/mali/roialign.cpp index 96a5008e..77f914b7 100644 --- a/compute/tensor/src/gpu/mali/roialign.cpp +++ b/compute/tensor/src/gpu/mali/roialign.cpp @@ -43,7 +43,7 @@ inline EE roialign_checkpara_mali(GCLHandle_t handle, outputDesc.dims[3] != inputDescs[1].dims[1]) { CHECK_STATUS(NOT_MATCH) } - if (roiAlignParamSpec.coordinateTransformationMode != ROIALIGN_HALF_PIXEL) { + if (roiAlignParamSpec.trans_mode != COORDINATE_TRANS_HALF_PIXEL) { CHECK_STATUS(NOT_SUPPORTED); } return SUCCESS; diff --git a/compute/tensor/src/gpu/mali/space2depth.cpp b/compute/tensor/src/gpu/mali/space2depth.cpp index d15797b4..71b9481b 100644 --- a/compute/tensor/src/gpu/mali/space2depth.cpp +++ b/compute/tensor/src/gpu/mali/space2depth.cpp @@ -52,7 +52,7 @@ inline EE space2depth_core_mali_fp16(GCLHandle_t handle, inbuf = input->mem; outbuf = output->mem; bool useNchw = (inputDesc.df == DF_NCHWC4) ? 
false : true; - U32 blockSize = space2DepthPara.blockSize; + U32 blockSize = space2DepthPara.block_size; U32 gs[3] = {iw, ih, (ic + 3) / 4}; U32 ls[3] = {0, 0, 0}; @@ -87,7 +87,7 @@ EE space2depth_padding_input_mali(TensorDesc inputDesc, if (inputMem == nullptr || outputMem == nullptr || outputDesc == nullptr) { CHECK_STATUS(NULL_POINTER); } - U32 blockSize = space2DepthPara.blockSize; + U32 blockSize = space2DepthPara.block_size; DataType idt; DataFormat idf; U32 iw, ih, ic, in; diff --git a/compute/tensor/src/gpu/mali/transpose.cpp b/compute/tensor/src/gpu/mali/transpose.cpp index 6124bb7b..5306b846 100644 --- a/compute/tensor/src/gpu/mali/transpose.cpp +++ b/compute/tensor/src/gpu/mali/transpose.cpp @@ -27,7 +27,7 @@ EE transpose_padding_input_mali(TensorDesc inputDesc, if (outputDesc == nullptr || inputMem == nullptr || outputMem == nullptr) { CHECK_STATUS(NULL_POINTER); } - U32 *dim = p.trans_dims; + U32 *dim = p.axes; U32 dimTran[6] = {1, 1, 1, 1, 1, 1}; U32 nDims = inputDesc.nDims; for (U32 i = 0; i < nDims; ++i) { @@ -88,8 +88,7 @@ EE transpose_mali(GCLHandle_t handle, CHECK_STATUS(transpose_checkpara_mali(handle, inputDesc, input, outputDesc, output)); switch (inputDesc.dt) { case DT_F16: { - ret = transpose_mali_fp16( - handle, inputDesc, input, outputDesc, output, tmpbuf, p.trans_dims); + ret = transpose_mali_fp16(handle, inputDesc, input, outputDesc, output, tmpbuf, p.axes); break; } default: diff --git a/compute/tensor/src/gpu/mali/uchar/bilateral_slice_apply_mali_uchar.cpp b/compute/tensor/src/gpu/mali/uchar/bilateral_slice_apply_mali_uchar.cpp index 660d3c13..c71bcccd 100644 --- a/compute/tensor/src/gpu/mali/uchar/bilateral_slice_apply_mali_uchar.cpp +++ b/compute/tensor/src/gpu/mali/uchar/bilateral_slice_apply_mali_uchar.cpp @@ -50,7 +50,7 @@ inline EE bilateral_slice_apply_core_mali_uchar(GCLHandle_t handle, tensorSelectGet(gridDesc, NULL, NULL, &gn, &gc, &gh, &gw); tensorSelectGet(outputDesc, NULL, NULL, &on, &oc, &oh, &ow); - U32 coe = bilateralSliceApplyParamSpec.coefficient_len; + U32 coe = bilateralSliceApplyParamSpec.coefficient; BilateralSliceApplyMode mode = bilateralSliceApplyParamSpec.mode; U32 dep = gc / coe; U32 gcw = gc * gw; @@ -62,7 +62,7 @@ inline EE bilateral_slice_apply_core_mali_uchar(GCLHandle_t handle, gridbuf = grid->mem; outbuf = output->mem; gridTran = tmpBuf->mem; - if (mode == BSliceApply_NULL) { + if (mode == BSLICE_APPLY_NULL) { guidebuf = guide->mem; } else { guidebuf = inbuf; @@ -85,11 +85,12 @@ inline EE bilateral_slice_apply_core_mali_uchar(GCLHandle_t handle, U32 gs[2] = {ow, oh}; U32 ls[2] = {0, 0}; U32 dim = 2; - char kernelname[128]; - if (mode == BSliceApply_CONV) { - sprintf(kernelname, "bilateral_slice_apply_c12_conv_uchar"); + const char *kernelname; + if (mode == BSLICE_APPLY_CONV) { + kernelname = "bilateral_slice_apply_c12_conv_uchar"; + ; } else { - sprintf(kernelname, "bilateral_slice_apply_c12_uchar"); + kernelname = "bilateral_slice_apply_c12_uchar"; } CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); CHECK_STATUS(gcl_set_kernelArgs(kernel, iw, wh, gc, gw, gh, gcw, dep, coe, gs[0], gs[1], @@ -100,7 +101,7 @@ inline EE bilateral_slice_apply_core_mali_uchar(GCLHandle_t handle, CHECK_STATUS(gcl_run_kernel_profiling(handle, kernel, dim, gs, ls, kernelname)); CHECK_STATUS(gcl_print_memory(handle, input, "bilateral_slice_apply_input")); CHECK_STATUS(gcl_print_memory(handle, output, "bilateral_slice_apply_output")); - if (mode == BSliceApply_NULL) { + if (mode == BSLICE_APPLY_NULL) { CHECK_STATUS(gcl_print_memory(handle, 
guide, "bilateral_slice_apply_guide")); } #endif diff --git a/compute/tensor/src/kl.cpp b/compute/tensor/src/kl.cpp index 07e5912e..4cec2e93 100644 --- a/compute/tensor/src/kl.cpp +++ b/compute/tensor/src/kl.cpp @@ -143,8 +143,6 @@ std::vector compute_scale_with_KL(std::vector &histogram, F32 interval } } } - F32 qSum = sum_func(DT_F32, qExpand.data(), i); - scale_func(DT_F32, qExpand.data(), qExpand.data(), i, 1 / qSum, 0); F32 kld = compute_KLD(i, clipDist.data(), qExpand.data()); if (kld < minKLD) { diff --git a/compute/tensor/src/matmul.cpp b/compute/tensor/src/matmul.cpp index d04dda5b..5d039ebb 100644 --- a/compute/tensor/src/matmul.cpp +++ b/compute/tensor/src/matmul.cpp @@ -13,7 +13,7 @@ #include "tensor_computing.h" #include "blas_enhance.h" -#include + #ifdef _USE_GPU #include "gpu/mali/tensor_computing_mali.h" #endif @@ -169,6 +169,38 @@ inline bool useINT8Type(DataType aDt, DataType bDt, DataType cDt, I32 flag) DT_I8 == cDt || flag != 0); } +EE mmm_infer_forward_tmp_bytes(U32 *bytes, + U32 kDimA, + U32 kDimB, + DataFormat dataFormatA, + DataFormat dataFormatB, + TensorDesc matrixADesc, + TensorDesc matrixBDesc, + Arch arch) +{ + EE ret = NOT_SUPPORTED; + if (matrixADesc.dims[1 - kDimA] == 1 || matrixBDesc.dims[1 - kDimB] == 1) { + TensorDesc matrixDesc, vectorDesc; + if (matrixADesc.dims[1 - kDimA] == 1) { + matrixDesc = + tensor2df(matrixBDesc.dt, dataFormatB, matrixBDesc.dims[1], matrixBDesc.dims[0]); + vectorDesc = tensor1d(matrixADesc.dt, matrixADesc.dims[kDimA]); + } else { + matrixDesc = + tensor2df(matrixADesc.dt, dataFormatA, matrixADesc.dims[1], matrixADesc.dims[0]); + vectorDesc = tensor1d(matrixBDesc.dt, matrixBDesc.dims[kDimB]); + } + ret = matrix_vector_multiply_tmp_bytes(matrixDesc, vectorDesc, bytes, arch); + } else { + TensorDesc matrixA2DDesc = + tensor2df(matrixADesc.dt, dataFormatA, matrixADesc.dims[1], matrixADesc.dims[0]); + TensorDesc matrixB2Ddesc = + tensor2df(matrixBDesc.dt, dataFormatB, matrixBDesc.dims[1], matrixBDesc.dims[0]); + ret = matrix_matrix_multiply_tmp_bytes(matrixA2DDesc, matrixB2Ddesc, bytes, arch); + } + return ret; +} + EE matmul_infer_forward_tmp_bytes(Tensor matrixATensor, bool transposeA, Tensor matrixBTensor, @@ -247,25 +279,12 @@ EE matmul_infer_forward_tmp_bytes(Tensor matrixATensor, kDimB = 1; dataFormatB = DF_NORMAL; } - if (matrixADesc.dims[1 - kDimA] == 1 || matrixBDesc.dims[1 - kDimB] == 1) { - TensorDesc matrixDesc, vectorDesc; - if (matrixADesc.dims[1 - kDimA] == 1) { - matrixDesc = - tensor2df(matrixBDesc.dt, dataFormatB, matrixBDesc.dims[1], matrixBDesc.dims[0]); - vectorDesc = tensor1d(matrixADesc.dt, matrixADesc.dims[kDimA]); - } else { - matrixDesc = - tensor2df(matrixADesc.dt, dataFormatA, matrixADesc.dims[1], matrixADesc.dims[0]); - vectorDesc = tensor1d(matrixBDesc.dt, matrixBDesc.dims[kDimB]); - } - ret = matrix_vector_multiply_tmp_bytes(matrixDesc, vectorDesc, bytes, archInfo->arch); - } else { - TensorDesc matrixA2DDesc = - tensor2df(matrixADesc.dt, dataFormatA, matrixADesc.dims[1], matrixADesc.dims[0]); - TensorDesc matrixB2Ddesc = - tensor2df(matrixBDesc.dt, dataFormatB, matrixBDesc.dims[1], matrixBDesc.dims[0]); - ret = matrix_matrix_multiply_tmp_bytes(matrixA2DDesc, matrixB2Ddesc, bytes, archInfo->arch); - } + mmm_infer_forward_tmp_bytes( + bytes, kDimA, kDimB, dataFormatA, dataFormatB, matrixADesc, matrixBDesc, archInfo->arch); +#ifdef _USE_OPENMP + U32 loopsC = tensorNumElements(matrixCDesc) / (matrixCDesc.dims[1] * matrixCDesc.dims[0]); + *bytes *= loopsC; +#endif if (quantA) { *bytes += 
tensorNumBytes(matrixADesc); @@ -396,112 +415,122 @@ EE matmul(Tensor matrixATensor, #endif U32 kDimA, kDimB; + DataFormat dataFormatA, dataFormatB; if (transposeA) { kDimA = 1; + dataFormatA = DF_TRANSPOSE; } else { kDimA = 0; + dataFormatA = DF_NORMAL; } if (transposeB) { kDimB = 0; + dataFormatB = DF_TRANSPOSE; } else { kDimB = 1; + dataFormatB = DF_NORMAL; } + align_input_desc(&matrixADesc, &matrixBDesc); + std::vector p = {(U8 *)matrixA, (U8 *)matrixB, (U8 *)matrixC, (U8 *)tmp}; - U32 matrixA2DBytes = (matrixADesc.dims[1] * matrixADesc.dims[0]) * bytesOf(matrixADesc.dt); - U32 matrixB2DBytes = (matrixBDesc.dims[1] * matrixBDesc.dims[0]) * bytesOf(matrixBDesc.dt); - U32 matrixC2DBytes = (matrixCDesc.dims[1] * matrixCDesc.dims[0]) * bytesOf(matrixCDesc.dt); if (biasTensor.bytes() > 0) { U8 *bias = (U8 *)get_ptr_from_tensor(biasTensor, arch); for (U32 i = 0; i < tensorNumBytes(matrixCDesc) / biasTensor.bytes(); i++) { - memcpy((U8 *)matrixC + i * biasTensor.bytes(), bias, biasTensor.bytes()); + UNI_MEMCPY((U8 *)matrixC + i * biasTensor.bytes(), bias, biasTensor.bytes()); } } else { - memset(matrixC, 0, tensorNumBytes(matrixCDesc)); + UNI_MEMSET(matrixC, 0, tensorNumBytes(matrixCDesc)); } - std::vector ADims, BDims, CDims; - U32 loopsA = tensorNumElements(matrixADesc) / (matrixADesc.dims[1] * matrixADesc.dims[0]); - U32 loopsB = tensorNumElements(matrixBDesc) / (matrixBDesc.dims[1] * matrixBDesc.dims[0]); - U32 loopsC = tensorNumElements(matrixCDesc) / (matrixCDesc.dims[1] * matrixCDesc.dims[0]); - align_input_desc(&matrixADesc, &matrixBDesc); - U32 ia, ib; - for (U32 ic = 0; ic < loopsC; ic++) { - CDims = calculateLocalIndex(ic, matrixCDesc.dims + 2, matrixCDesc.nDims - 2); - if (loopsA == loopsC) { - ia = ic; - } else { - ADims = CDims; - for (U32 i = 2; i < matrixADesc.nDims; i++) { - if (ADims[i - 2] >= matrixADesc.dims[i]) { - ADims[i - 2] = 0; + + U32 mmmBytes = 0; +#if defined(_USE_OPENMP) && defined(_USE_CPU) + mmm_infer_forward_tmp_bytes(&mmmBytes, kDimA, kDimB, dataFormatA, dataFormatB, matrixADesc, + matrixBDesc, archInfo->arch); +#pragma omp parallel num_threads(OMP_NUM_THREADS) +#endif + { + U32 matrixA2DBytes = (matrixADesc.dims[1] * matrixADesc.dims[0]) * bytesOf(matrixADesc.dt); + U32 matrixB2DBytes = (matrixBDesc.dims[1] * matrixBDesc.dims[0]) * bytesOf(matrixBDesc.dt); + U32 matrixC2DBytes = (matrixCDesc.dims[1] * matrixCDesc.dims[0]) * bytesOf(matrixCDesc.dt); + U32 loopsA = tensorNumElements(matrixADesc) / (matrixADesc.dims[1] * matrixADesc.dims[0]); + U32 loopsB = tensorNumElements(matrixBDesc) / (matrixBDesc.dims[1] * matrixBDesc.dims[0]); + U32 loopsC = tensorNumElements(matrixCDesc) / (matrixCDesc.dims[1] * matrixCDesc.dims[0]); +#if defined(_USE_OPENMP) +#pragma omp for +#endif + for (U32 ic = 0; ic < loopsC; ic++) { + U32 ia, ib; + std::vector ADims, BDims, CDims; + U8 *tmpPtr = p[3] + ic * mmmBytes; + CDims = calculateLocalIndex(ic, matrixCDesc.dims + 2, matrixCDesc.nDims - 2); + if (loopsA == loopsC) { + ia = ic; + } else { + ADims = CDims; + for (U32 i = 2; i < matrixADesc.nDims; i++) { + if (ADims[i - 2] >= matrixADesc.dims[i]) { + ADims[i - 2] = 0; + } } + ia = calculateGlobalIndex(ADims.data(), matrixADesc.dims + 2, matrixADesc.nDims - 2); } - ia = calculateGlobalIndex(ADims.data(), matrixADesc.dims + 2, matrixADesc.nDims - 2); - } - if (loopsB == loopsC) { - ib = ic; - } else { - BDims = CDims; - for (U32 i = 2; i < matrixBDesc.nDims; i++) { - if (BDims[i - 2] >= matrixBDesc.dims[i]) { - BDims[i - 2] = 0; + if (loopsB == loopsC) { + ib = ic; + } else { + 
BDims = CDims; + for (U32 i = 2; i < matrixBDesc.nDims; i++) { + if (BDims[i - 2] >= matrixBDesc.dims[i]) { + BDims[i - 2] = 0; + } } + ib = calculateGlobalIndex(BDims.data(), matrixBDesc.dims + 2, matrixBDesc.nDims - 2); } - ib = calculateGlobalIndex(BDims.data(), matrixBDesc.dims + 2, matrixBDesc.nDims - 2); - } - U8 *matrixAPtr = (U8 *)matrixA + ia * matrixA2DBytes; - U8 *matrixBPtr = (U8 *)matrixB + ib * matrixB2DBytes; - U8 *matrixCPtr = (U8 *)matrixC + ic * matrixC2DBytes; - if (matrixADesc.dims[1 - kDimA] == 1) { - TensorDesc matrixA1DDesc = tensor1d(matrixADesc.dt, matrixADesc.dims[kDimA]); - TensorDesc matrixB2DDesc = tensor2df(matrixBDesc.dt, - transposeB ? DF_NORMAL : DF_TRANSPOSE, matrixBDesc.dims[1], matrixBDesc.dims[0]); - TensorDesc matrixC1DDesc = tensor1d(matrixCDesc.dt, matrixCDesc.dims[0]); + U8 *matrixAPtr = p[0] + ia * matrixA2DBytes; + U8 *matrixBPtr = p[1] + ib * matrixB2DBytes; + U8 *matrixCPtr = p[2] + ic * matrixC2DBytes; + if (matrixADesc.dims[1 - kDimA] == 1) { + TensorDesc matrixA1DDesc = tensor1d(matrixADesc.dt, matrixADesc.dims[kDimA]); + TensorDesc matrixB2DDesc = tensor2df(matrixBDesc.dt, + transposeB ? DF_NORMAL : DF_TRANSPOSE, matrixBDesc.dims[1], matrixBDesc.dims[0]); + TensorDesc matrixC1DDesc = tensor1d(matrixCDesc.dt, matrixCDesc.dims[0]); - CHECK_STATUS(matrix_vector_multiply(matrixB2DDesc, matrixBPtr, matrixA1DDesc, - matrixAPtr, tmpBytes, tmp, matrixC1DDesc, matrixCPtr, scalePtr, archInfo->arch)); - } else { - if (matrixBDesc.dims[1 - kDimB] == 1) { - TensorDesc matrixA2DDesc; - if (transposeA) { - matrixA2DDesc = tensor2df( - matrixADesc.dt, DF_TRANSPOSE, matrixADesc.dims[1], matrixADesc.dims[0]); - } else { - matrixA2DDesc = tensor2df( - matrixADesc.dt, DF_NORMAL, matrixADesc.dims[1], matrixADesc.dims[0]); - } - TensorDesc matrixB1DDesc = tensor1d(matrixBDesc.dt, matrixBDesc.dims[kDimB]); - TensorDesc matrixC1DDesc = tensor1d(matrixCDesc.dt, matrixCDesc.dims[1]); - - CHECK_STATUS(matrix_vector_multiply(matrixA2DDesc, matrixAPtr, matrixB1DDesc, - matrixBPtr, tmpBytes, tmp, matrixC1DDesc, matrixCPtr, scalePtr, archInfo->arch)); + CHECK_STATUS( + matrix_vector_multiply(matrixB2DDesc, matrixBPtr, matrixA1DDesc, matrixAPtr, + tmpBytes, tmpPtr, matrixC1DDesc, matrixCPtr, scalePtr, archInfo->arch)); } else { - DataFormat dataFormatA, dataFormatB; - if (transposeA) { - dataFormatA = DF_TRANSPOSE; - } else { - dataFormatA = DF_NORMAL; - } - if (transposeB) { - dataFormatB = DF_TRANSPOSE; + if (matrixBDesc.dims[1 - kDimB] == 1) { + TensorDesc matrixA2DDesc; + if (transposeA) { + matrixA2DDesc = tensor2df( + matrixADesc.dt, DF_TRANSPOSE, matrixADesc.dims[1], matrixADesc.dims[0]); + } else { + matrixA2DDesc = tensor2df( + matrixADesc.dt, DF_NORMAL, matrixADesc.dims[1], matrixADesc.dims[0]); + } + TensorDesc matrixB1DDesc = tensor1d(matrixBDesc.dt, matrixBDesc.dims[kDimB]); + TensorDesc matrixC1DDesc = tensor1d(matrixCDesc.dt, matrixCDesc.dims[1]); + + CHECK_STATUS( + matrix_vector_multiply(matrixA2DDesc, matrixAPtr, matrixB1DDesc, matrixBPtr, + tmpBytes, tmpPtr, matrixC1DDesc, matrixCPtr, scalePtr, archInfo->arch)); } else { - dataFormatB = DF_NORMAL; - } - TensorDesc matrixA2DDesc = tensor2df( - matrixADesc.dt, dataFormatA, matrixADesc.dims[1], matrixADesc.dims[0]); - TensorDesc matrixB2DDesc = tensor2df( - matrixBDesc.dt, dataFormatB, matrixBDesc.dims[1], matrixBDesc.dims[0]); - TensorDesc matrixC2DDesc = - tensor2df(matrixCDesc.dt, DF_NORMAL, matrixCDesc.dims[1], matrixCDesc.dims[0]); + TensorDesc matrixA2DDesc = tensor2df( + matrixADesc.dt, dataFormatA, 
matrixADesc.dims[1], matrixADesc.dims[0]); + TensorDesc matrixB2DDesc = tensor2df( + matrixBDesc.dt, dataFormatB, matrixBDesc.dims[1], matrixBDesc.dims[0]); + TensorDesc matrixC2DDesc = tensor2df( + matrixCDesc.dt, DF_NORMAL, matrixCDesc.dims[1], matrixCDesc.dims[0]); #if defined(_USE_X86) && defined(_USE_INT8) - memset(tmp, 0, matrixCDesc.dims[0] * bytesOf(DT_I32)); + UNI_MEMSET(tmpPtr, 0, matrixCDesc.dims[0] * bytesOf(DT_I32)); #endif - CHECK_STATUS(matrix_matrix_multiply(matrixA2DDesc, matrixAPtr, matrixB2DDesc, - matrixBPtr, tmpBytes, tmp, matrixC2DDesc, matrixCPtr, scalePtr, archInfo->arch)); + CHECK_STATUS( + matrix_matrix_multiply(matrixA2DDesc, matrixAPtr, matrixB2DDesc, matrixBPtr, + tmpBytes, tmpPtr, matrixC2DDesc, matrixCPtr, scalePtr, archInfo->arch)); + } } } } - #ifdef _USE_INT8 if (useINT8 && (matrixCTensor.get_desc().dt != matrixCDesc.dt)) { if (DT_I8 == matrixCTensor.get_desc().dt || DT_U8_Q == matrixCTensor.get_desc().dt) { diff --git a/compute/tensor/src/non_max_suppression.cpp b/compute/tensor/src/non_max_suppression.cpp index cf04825f..c77cb482 100644 --- a/compute/tensor/src/non_max_suppression.cpp +++ b/compute/tensor/src/non_max_suppression.cpp @@ -35,15 +35,14 @@ inline EE non_max_suppression_infer_output_size_cpu( CHECK_REQUIREMENT(p.max_output_boxes_per_class != 0); // output size U32 oh, ow; - // oh = the first box for saving the number of available boxes(1) + the maximum number of dectected boxes(max_output_boxes_per_class * num_class) + // oh = the first box for saving the maximum number of dectected boxes(max_output_boxes_per_class * num_class) U32 max_output_boxes_per_class = p.max_output_boxes_per_class; U32 num_class = ic1; U32 num_detected_max = max_output_boxes_per_class * num_class; - oh = num_detected_max + 1; + oh = num_detected_max; // Each width is a 3 dimension vector, which stores [batch_index, class_index, box_index] -> 3 - // The first box is [ number of available boxes, 0, 0 ] ow = 3; - *outputDesc = tensor2d(idt0, oh, ow); + *outputDesc = tensor2d(DT_I32, oh, ow); return SUCCESS; } @@ -77,7 +76,10 @@ EE non_max_suppression(std::vector inputTensor, EE ret = NOT_SUPPORTED; if (IS_CPU(arch)) { #ifdef _USE_CPU - ret = non_max_suppression_cpu(inputDesc, input, p, outputDesc, output); + U32 length = 0; + ret = non_max_suppression_cpu(inputDesc, input, p, outputDesc, output, &length); + outputDesc.dims[1] = length; + outputTensor.resize(outputDesc); #endif } return ret; diff --git a/compute/tensor/src/non_zero.cpp b/compute/tensor/src/non_zero.cpp new file mode 100644 index 00000000..fe8c443e --- /dev/null +++ b/compute/tensor/src/non_zero.cpp @@ -0,0 +1,34 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "tensor_computing.h" +#ifdef _USE_CPU +#include "cpu/tensor_computing_cpu.h" +#endif + +EE non_zero(Tensor inputTensor, Tensor outputTensor, ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + EE ret = NOT_SUPPORTED; + if (IS_CPU(arch)) { + U32 length = 0; + ret = non_zero_cpu(inputDesc, input, outputDesc, output, &length); + outputDesc.dims[0] = length; + outputTensor.resize(outputDesc); + } + return ret; +} diff --git a/compute/tensor/src/normalization.cpp b/compute/tensor/src/normalization.cpp index 4bf343e6..7115d500 100644 --- a/compute/tensor/src/normalization.cpp +++ b/compute/tensor/src/normalization.cpp @@ -29,6 +29,7 @@ #endif EE layer_normalization(Tensor inputTensor, + LayerNormParamSpec p, Tensor alphaTensor, Tensor betaTensor, Tensor tmpTensor, @@ -54,22 +55,28 @@ EE layer_normalization(Tensor inputTensor, EE ret = NOT_SUPPORTED; if (IS_GENERAL(arch)) { #ifdef _USE_GENERAL - ret = layer_normalization_general(inputDesc, input, alpha, beta, outputDesc, output); + ret = layer_normalization_general(inputDesc, input, p, alpha, beta, outputDesc, output); #endif #ifdef _USE_X86 } else if (IS_X86(arch)) { - ret = layer_normalization_x86(inputDesc, input, alpha, beta, outputDesc, output); + ret = layer_normalization_x86(inputDesc, input, p, alpha, beta, outputDesc, output); #endif #ifdef _USE_NEON } else if (IS_ARM(arch)) { - ret = layer_normalization_arm(inputDesc, input, alpha, beta, outputDesc, output); + ret = layer_normalization_arm(inputDesc, input, p, alpha, beta, outputDesc, output); #endif #ifdef _USE_GPU } else if (IS_GPU(arch)) { void *tmpbuf = get_ptr_from_tensor(tmpTensor, arch); - ret = layer_normalization_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, - (GCLMem_t)input, (GCLMem_t)alpha, (GCLMem_t)beta, (GCLMem_t)tmpbuf, outputDesc, - (GCLMem_t)output); + if (p.axis == -1) { + ret = layer_normalization_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, + (GCLMem_t)input, (GCLMem_t)alpha, (GCLMem_t)beta, (GCLMem_t)tmpbuf, outputDesc, + (GCLMem_t)output); + } else { + UNI_WARNING_LOG("please close optimizeTransposeLN in " + "model_tools/include/OPOptimizers/LayerNormOptimizer.hpp and " + "reconverter model.\n"); + } #endif } @@ -97,10 +104,7 @@ EE normalization_infer_output_size(Tensor *inputTensor, Tensor *outputTensor, Ar if (outputTensor == nullptr) { CHECK_STATUS(NULL_POINTER); } - TensorDesc inputDesc = inputTensor->get_desc(); - TensorDesc outputDesc = outputTensor->get_desc(); - outputDesc = inputDesc; - outputTensor->resize(outputDesc); + outputTensor->resize(inputTensor->get_desc()); return SUCCESS; } @@ -109,13 +113,15 @@ EE normalization_infer_forward_tmp_bytes(Tensor inputTensor, U32 *bytes, ArchInf if (bytes == nullptr) { CHECK_STATUS(NULL_POINTER); } + EE ret = NOT_SUPPORTED; if (IS_GPU(archInfo->arch)) { #ifdef _USE_GPU GCLMemDesc gclmemInputDesc = ocl_get_desc(inputTensor); - CHECK_STATUS(normalization_infer_forward_tmp_bytes_mali(gclmemInputDesc, bytes)); + ret = normalization_infer_forward_tmp_bytes_mali(gclmemInputDesc, bytes); #endif } else { 
*bytes = 0; + ret = SUCCESS; } - return SUCCESS; + return ret; } diff --git a/compute/tensor/src/onehot.cpp b/compute/tensor/src/onehot.cpp new file mode 100644 index 00000000..2530bc83 --- /dev/null +++ b/compute/tensor/src/onehot.cpp @@ -0,0 +1,53 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "tensor_computing.h" +#ifdef _USE_CPU +#include "cpu/tensor_computing_cpu.h" +#endif + +EE onehot_infer_output_size( + Tensor *inputTensor, OneHotParamSpec p, DataType type, Tensor *outputTensor, ArchInfo_t archInfo) +{ + if (inputTensor == nullptr || outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + TensorDesc outputDesc = inputDesc; + outputDesc.dt = type; + outputDesc.nDims++; + int axis = (p.axis + outputDesc.nDims) % outputDesc.nDims; + axis = outputDesc.nDims - 1 - axis; + for (U32 i = axis + 1; i < outputDesc.nDims; i++) { + outputDesc.dims[i] = outputDesc.dims[i - 1]; + } + outputDesc.dims[axis] = p.depth; + outputTensor->resize(outputDesc); + return SUCCESS; +} + +EE onehot(Tensor inputTensor, OneHotParamSpec p, Tensor outputTensor, ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + EE ret = NOT_SUPPORTED; + if (IS_CPU(arch)) { +#ifdef _USE_CPU + ret = onehot_cpu(inputDesc, input, p, outputDesc, output); +#endif + } + return ret; +} diff --git a/compute/tensor/src/pooling.cpp b/compute/tensor/src/pooling.cpp index b2949bf9..286b1921 100644 --- a/compute/tensor/src/pooling.cpp +++ b/compute/tensor/src/pooling.cpp @@ -46,41 +46,31 @@ inline EE pooling_infer_output_size_cpu( CHECK_STATUS(NOT_SUPPORTED); return NOT_SUPPORTED; } - RoundMode rm = p.rm; + RoundMode rm = p.round_mode; U32 ot = 0, oh = 0, ow = 0; EE ret = SUCCESS; switch (rm) { - case CEIL: { - ot = (U32)(ceil( - (double(it + p.padding_before + p.padding_after - p.kernel_t) / p.stride_t))) + - 1; - oh = (U32)(ceil( - (double(ih + p.padding_top + p.padding_bottom - p.kernel_h) / p.stride_h))) + - 1; - ow = (U32)(ceil( - (double(iw + p.padding_left + p.padding_right - p.kernel_w) / p.stride_w))) + + case ROUND_CEIL: { + ot = (U32)(ceil((double(it + p.pad_before + p.pad_after - p.kernel_t) / p.stride_t))) + 1; + oh = (U32)(ceil((double(ih + p.pad_top + p.pad_bottom - 
p.kernel_h) / p.stride_h))) + 1; + ow = (U32)(ceil((double(iw + p.pad_left + p.pad_right - p.kernel_w) / p.stride_w))) + 1; break; } - case FLOOR: { - ot = (U32)(floor( - (double(it + p.padding_before + p.padding_after - p.kernel_t) / p.stride_t))) + - 1; - oh = (U32)(floor( - (double(ih + p.padding_top + p.padding_bottom - p.kernel_h) / p.stride_h))) + - 1; - ow = (U32)(floor( - (double(iw + p.padding_left + p.padding_right - p.kernel_w) / p.stride_w))) + + case ROUND_FLOOR: { + ot = (U32)(floor((double(it + p.pad_before + p.pad_after - p.kernel_t) / p.stride_t))) + 1; + oh = (U32)(floor((double(ih + p.pad_top + p.pad_bottom - p.kernel_h) / p.stride_h))) + 1; + ow = (U32)(floor((double(iw + p.pad_left + p.pad_right - p.kernel_w) / p.stride_w))) + 1; break; } - case TF_SAME: { + case ROUND_TF_SAME: { ot = (U32)(ceil((double(it) / p.stride_t))); oh = (U32)(ceil((double(ih) / p.stride_h))); ow = (U32)(ceil((double(iw) / p.stride_w))); break; } - case TF_VALID: { + case ROUND_TF_VALID: { ot = (U32)(ceil((double(it - p.kernel_t + 1) / p.stride_t))); oh = (U32)(ceil((double(ih - p.kernel_h + 1) / p.stride_h))); ow = (U32)(ceil((double(iw - p.kernel_w + 1) / p.stride_w))); @@ -91,16 +81,46 @@ inline EE pooling_infer_output_size_cpu( break; } } + DataFormat odf = idf; + if (idt == DT_U8_Q) { + odf = DF_NCHWC16; + } if (tensorIs3d(inputDesc)) { - *outputDesc = tensor3df(idt, idf, in, ic, oh); + *outputDesc = tensor3df(idt, odf, in, ic, oh); } else if (tensorIs4d(inputDesc)) { - *outputDesc = tensor4df(idt, idf, in, ic, oh, ow); + *outputDesc = tensor4df(idt, odf, in, ic, oh, ow); } else if (tensorIs5d(inputDesc)) { - *outputDesc = tensor5df(idt, idf, in, ic, ot, oh, ow); + *outputDesc = tensor5df(idt, odf, in, ic, ot, oh, ow); } return ret; } +static inline PoolingParamSpec update_param(TensorDesc inDesc, PoolingParamSpec poolingParamSpec) +{ + if (0 == poolingParamSpec.kernel_w) { + if (inDesc.nDims > 3) { + poolingParamSpec.kernel_w = inDesc.dims[0]; + } else { + poolingParamSpec.kernel_w = 1; + } + } + if (0 == poolingParamSpec.kernel_h) { + if (inDesc.nDims > 3) { + poolingParamSpec.kernel_h = inDesc.dims[1]; + } else { + poolingParamSpec.kernel_h = inDesc.dims[0]; + } + } + if (0 == poolingParamSpec.kernel_t) { + if (inDesc.nDims > 4) { + poolingParamSpec.kernel_t = inDesc.dims[2]; + } else { + poolingParamSpec.kernel_t = 1; + } + } + return poolingParamSpec; +} + EE pooling_infer_output_size( Tensor *inputTensor, PoolingParamSpec poolingParamSpec, Tensor *outputTensor, ArchInfo_t archInfo) { @@ -110,15 +130,7 @@ EE pooling_infer_output_size( TensorDesc inputDesc = inputTensor->get_desc(); TensorDesc newInputDesc = transformDescTo4d(inputDesc); TensorDesc outputDesc = outputTensor->get_desc(); - if (0 == poolingParamSpec.kernel_w) { - poolingParamSpec.kernel_w = newInputDesc.dims[0]; - } - if (0 == poolingParamSpec.kernel_h) { - poolingParamSpec.kernel_h = newInputDesc.dims[1]; - } - if (0 == poolingParamSpec.kernel_t) { - poolingParamSpec.kernel_t = newInputDesc.dims[2]; - } + poolingParamSpec = update_param(newInputDesc, poolingParamSpec); CHECK_STATUS(pooling_infer_output_size_cpu(inputDesc, poolingParamSpec, &outputDesc)); if (IS_GPU(archInfo->arch)) { #ifdef _USE_GPU @@ -145,55 +157,74 @@ EE pooling(Tensor inputTensor, void *output = get_ptr_from_tensor(outputTensor, arch); F32 scale[2] = {inputTensor.get_scale(), -1}; void *tmp = get_ptr_from_tensor(tmpTensor, arch); - - if (0 == poolingParamSpec.kernel_w) { - poolingParamSpec.kernel_w = inputDesc.dims[0]; - } - if (0 == 
poolingParamSpec.kernel_h) { - poolingParamSpec.kernel_h = inputDesc.dims[1]; - } - if (0 == poolingParamSpec.kernel_t) { - poolingParamSpec.kernel_t = inputDesc.dims[2]; - } - TensorDesc inDescCPU = inputDesc; - U8 *inputCPU = (U8 *)input; - TensorDesc outDescCPU = outputDesc; - U8 *outputCPU = (U8 *)output; - if (DF_NCHWC16 != inputDesc.df && DF_NCHWC8 != inputDesc.df && IS_CPU(arch)) { - int channelAxis = inputDesc.nDims - 2; - U32 paddedC = (inputDesc.dims[channelAxis] + 7) / 8 * 8; - inDescCPU.dims[channelAxis] = paddedC; - inDescCPU.df = DF_NCHWC8; - outDescCPU.dims[channelAxis] = paddedC; - outDescCPU.df = DF_NCHWC8; - inputCPU = (U8 *)tmp; - outputCPU = inputCPU + tensorNumBytes(inDescCPU); - transformNCHWToNCHWC8(inputDesc, input, inDescCPU, inputCPU); - } + poolingParamSpec = update_param(inputDesc, poolingParamSpec); EE ret = NOT_SUPPORTED; - if (IS_GENERAL(arch)) { + if (IS_GPU(arch)) { +#ifdef _USE_GPU + ret = pooling_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, + (const GCLMem_t)input, poolingParamSpec, scale, (GCLMem_t)tmp, outputDesc, + (GCLMem_t)output); +#endif + } else if (IS_CPU(arch)) { +#ifdef _USE_CPU + U8 *inputCPU = (U8 *)input; + U8 *outputCPU = (U8 *)output; + TensorDesc inDescCPU = inputDesc; + TensorDesc outDescCPU = outputDesc; + DataFormat dstF = outputDesc.df; + int channelAxis = inputDesc.nDims - 2; + + U32 cx = 8; + if (IS_X86(arch)) { + if (dstF == DF_NCHW || dstF == DF_MTK) { + cx = 1; + } + if (inputDesc.dt == DT_U8_Q) { + dstF = DF_NCHWC16; // padding to 16 + cx = 16; + } + } else { + dstF = DF_NCHWC8; + } + + U32 paddedC = (inputDesc.dims[channelAxis] + cx - 1) / cx * cx; + + if (paddedC != inputDesc.dims[channelAxis] || (inputDesc.df != dstF)) { + inDescCPU.dims[channelAxis] = paddedC; + inDescCPU.df = dstF; + inputCPU = (U8 *)tmp; + tmp = (U8 *)tmp + tensorNumBytes(inDescCPU); + transformFormat(inputDesc, input, inDescCPU, inputCPU); + } + + if (paddedC != inputDesc.dims[channelAxis] || (outputDesc.df != dstF)) { + outDescCPU.dims[channelAxis] = paddedC; + outDescCPU.df = dstF; + outputCPU = (U8 *)tmp; + } + + if (IS_GENERAL(arch)) { #ifdef _USE_GENERAL - ret = pooling_general(inDescCPU, inputCPU, poolingParamSpec, outDescCPU, outputCPU); + ret = pooling_general( + inDescCPU, inputCPU, poolingParamSpec, scale, outDescCPU, outputCPU); #endif #ifdef _USE_X86 - } else if (IS_X86(arch)) { - ret = pooling_x86(inDescCPU, inputCPU, poolingParamSpec, scale, outDescCPU, outputCPU); + } else if (IS_X86(arch)) { + ret = pooling_x86(inDescCPU, inputCPU, poolingParamSpec, scale, outDescCPU, outputCPU); #endif #ifdef _USE_NEON - } else if (IS_ARM(arch)) { - ret = pooling_arm(inDescCPU, inputCPU, poolingParamSpec, scale, outDescCPU, outputCPU); + } else if (IS_ARM(arch)) { + ret = pooling_arm(inDescCPU, inputCPU, poolingParamSpec, scale, outDescCPU, outputCPU); #endif -#ifdef _USE_GPU - } else if (IS_GPU(arch)) { - ret = pooling_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, - (const GCLMem_t)input, poolingParamSpec, scale, (GCLMem_t)tmp, outputDesc, - (GCLMem_t)output); + } + + if (paddedC != inputDesc.dims[channelAxis] || (outputDesc.df != outDescCPU.df)) { + transformFormat(outDescCPU, outputCPU, outputDesc, output); + } + outputTensor.set_scale(scale[1]); #endif } - if (DF_NCHWC16 != inputDesc.df && DF_NCHWC8 != outputDesc.df && IS_CPU(arch)) { - transformToNCHW(outDescCPU, outputCPU, outputDesc, output); - } - outputTensor.set_scale(scale[1]); + return ret; } @@ -213,13 +244,32 @@ EE pooling_infer_forward_tmp_bytes( } else { *bytes 
= 0; ret = SUCCESS; - if (DF_NCHW == inputDesc.df) { - int channelAxis = inputDesc.nDims - 2; - U32 paddedC = (inputDesc.dims[channelAxis] + 7) / 8 * 8; - TensorDesc outputDesc = outputTensor.get_desc(); + + TensorDesc outputDesc = transformDescTo4d(outputTensor.get_desc()); + DataFormat dstF = outputDesc.df; + int channelAxis = inputDesc.nDims - 2; + U32 cx = 8; + if (IS_X86(archInfo->arch)) { + if (dstF == DF_NCHW || dstF == DF_MTK) { + cx = 1; + } + if (inputDesc.dt == DT_U8_Q) { + dstF = DF_NCHWC16; // padding to 16 + cx = 16; + } + } else { + dstF = DF_NCHWC8; + } + U32 paddedC = (inputDesc.dims[channelAxis] + cx - 1) / cx * cx; + + if (paddedC != inputDesc.dims[channelAxis] || (inputDesc.df != dstF)) { inputDesc.dims[channelAxis] = paddedC; + *bytes += tensorNumBytes(inputDesc); + } + + if (paddedC != outputDesc.dims[channelAxis] || (outputDesc.df != dstF)) { outputDesc.dims[channelAxis] = paddedC; - *bytes = tensorNumBytes(inputDesc) + tensorNumBytes(outputDesc); + *bytes += tensorNumBytes(outputDesc); } } return ret; diff --git a/compute/tensor/src/power.cpp b/compute/tensor/src/power.cpp index 42658b5e..4e10838f 100644 --- a/compute/tensor/src/power.cpp +++ b/compute/tensor/src/power.cpp @@ -19,28 +19,31 @@ #include "gpu/mali/tensor_computing_mali.h" #endif -inline EE power_infer_output_size_cpu(TensorDesc inputDesc, TensorDesc *outputDesc) +inline EE power_infer_output_size_cpu( + TensorDesc inputDesc, PowerParamSpec p, TensorDesc *outputDesc, Arch arch) { - if (nullptr == outputDesc) { - CHECK_STATUS(NULL_POINTER); - } *outputDesc = inputDesc; - return SUCCESS; + EE ret = SUCCESS; +#ifdef _USE_CPU + if (tensorIsShape(inputDesc)) { + ret = power_cpu(inputDesc, inputDesc.dims + inputDesc.nDims, p, *outputDesc, + outputDesc->dims + outputDesc->nDims, arch); + } +#endif + return ret; } -EE power_infer_output_size(Tensor *inputTensor, Tensor *outputTensor, ArchInfo_t archInfo) +EE power_infer_output_size( + Tensor *inputTensor, PowerParamSpec p, Tensor *outputTensor, ArchInfo_t archInfo) { - if (inputTensor == nullptr) { - CHECK_STATUS(NULL_POINTER); - } - if (outputTensor == nullptr) { - CHECK_STATUS(NULL_POINTER); + if (inputTensor == nullptr || outputTensor == nullptr) { + return NULL_POINTER; } TensorDesc inputDesc = inputTensor->get_desc(); TensorDesc outputDesc = outputTensor->get_desc(); - CHECK_STATUS(power_infer_output_size_cpu(inputDesc, &outputDesc)); + EE ret = power_infer_output_size_cpu(inputDesc, p, &outputDesc, archInfo->arch); outputTensor->resize(outputDesc); - return SUCCESS; + return ret; } EE power(Tensor inputTensor, PowerParamSpec p, Tensor outputTensor, ArchInfo_t archInfo) @@ -50,7 +53,6 @@ EE power(Tensor inputTensor, PowerParamSpec p, Tensor outputTensor, ArchInfo_t a void *input = get_ptr_from_tensor(inputTensor, arch); TensorDesc outputDesc = outputTensor.get_desc(); void *output = get_ptr_from_tensor(outputTensor, arch); - EE ret = NOT_SUPPORTED; if (IS_CPU(arch)) { #ifdef _USE_CPU diff --git a/compute/tensor/src/preallocated_memory.cpp b/compute/tensor/src/preallocated_memory.cpp index 35f542cb..575ba51f 100644 --- a/compute/tensor/src/preallocated_memory.cpp +++ b/compute/tensor/src/preallocated_memory.cpp @@ -16,17 +16,32 @@ #include "gpu/mali/tensor_computing_mali.h" #endif -EE preallocated_memory_infer_output_size(Tensor *outputTensor, ArchInfo_t archInfo) +EE preallocated_memory_infer_output_size(std::vector inputTensors, + PreAllocatedMemoryParamSpec p, + Tensor *outputTensor, + ArchInfo_t archInfo) { if (outputTensor == nullptr) { 
CHECK_STATUS(NULL_POINTER); } - TensorDesc outputDesc = outputTensor->get_desc(); - outputTensor->resize(outputDesc); + TensorDesc outDesc = p.desc; + if (inputTensors.size() > 0) { + TensorDesc inDesc = inputTensors[0]->get_desc(); + if (outDesc.nDims == 0) { + outDesc = inDesc; + } else { + for (U32 i = 0; i < UNI_MIN(inDesc.nDims, outDesc.nDims); i++) { + if (outDesc.dims[outDesc.nDims - 1 - i] <= 0) { + outDesc.dims[outDesc.nDims - 1 - i] = inDesc.dims[inDesc.nDims - 1 - i]; + } + } + } + } + outputTensor->resize(outDesc); return SUCCESS; } -EE preallocated_memory(Tensor outputTensor, ArchInfo_t archInfo) +EE preallocated_memory(PreAllocatedMemoryParamSpec p, Tensor outputTensor, ArchInfo_t archInfo) { auto arch = archInfo->arch; TensorDesc outputDesc = outputTensor.get_desc(); @@ -40,7 +55,7 @@ EE preallocated_memory(Tensor outputTensor, ArchInfo_t archInfo) #endif #ifdef _USE_CPU } else { - memset(output, 0, tensorNumBytes(outputDesc)); + UNI_INIT(tensorNumElements(outputDesc), outputDesc.dt, p.value, output); ret = SUCCESS; #endif } diff --git a/compute/tensor/src/reduction.cpp b/compute/tensor/src/reduction.cpp index 74577aa8..56ded759 100644 --- a/compute/tensor/src/reduction.cpp +++ b/compute/tensor/src/reduction.cpp @@ -35,7 +35,6 @@ EE reduction(Tensor inputTensor, void *tmp = get_ptr_from_tensor(tmpTensor, arch); TensorDesc outputDesc = outputTensor.get_desc(); void *output = get_ptr_from_tensor(outputTensor, arch); - EE ret = NOT_SUPPORTED; if (IS_CPU(arch)) { #ifdef _USE_CPU @@ -55,31 +54,33 @@ EE reduction_infer_forward_tmp_bytes( Tensor inputTensor, ReductionParamSpec p, Tensor outputTensor, U32 *bytes, ArchInfo_t archInfo) { TensorDesc inputDesc = inputTensor.get_desc(); + EE ret = NOT_SUPPORTED; if (IS_GPU(archInfo->arch)) { #ifdef _USE_GPU TensorDesc outputDesc = outputTensor.get_desc(); GCLMemDesc gclmemInputDesc = ocl_get_desc(inputTensor); GCLMemDesc gclmemOutputDesc = ocl_get_desc(outputTensor); - CHECK_STATUS(reduction_infer_forward_tmp_bytes_mali( - inputDesc, p, outputDesc, gclmemInputDesc, gclmemOutputDesc, bytes)); - return SUCCESS; + ret = reduction_infer_forward_tmp_bytes_mali( + inputDesc, p, outputDesc, gclmemInputDesc, gclmemOutputDesc, bytes); #endif - } - int factor = 0; - if (p.axes_num > 1) { - factor = 2; - } - if (inputDesc.df == DF_NCHWC8 || inputDesc.df == DF_NCHWC16) { - for (int i = 0; i < p.axes_num; i++) { - // channel dimension - if (p.axes[i] == 1 || p.axes[i] == -3) { - factor = 2; - break; + } else { + int factor = 0; + if (p.num_axes > 1) { + factor = 2; + } + if (inputDesc.df == DF_NCHWC8 || inputDesc.df == DF_NCHWC16) { + for (int i = 0; i < p.num_axes; i++) { + // channel dimension + if (p.axes[i] == 1 || p.axes[i] == -3) { + factor = 2; + break; + } } } + *bytes = UNI_MAX(inputTensor.bytes(), outputTensor.bytes()) * factor; + ret = SUCCESS; } - *bytes = UNI_MAX(inputTensor.bytes(), outputTensor.bytes()) * factor; - return SUCCESS; + return ret; } EE reduction_infer_output_size(Tensor *inputTensor, @@ -88,28 +89,26 @@ EE reduction_infer_output_size(Tensor *inputTensor, Tensor *outputTensor, ArchInfo_t archInfo) { - if (inputTensor == nullptr) { - CHECK_STATUS(NULL_POINTER); - } - if (outputTensor == nullptr) { + if (inputTensor == nullptr || outputTensor == nullptr) { CHECK_STATUS(NULL_POINTER); } TensorDesc inputDesc = inputTensor->get_desc(); TensorDesc maskDesc = maskTensor.get_desc(); TensorDesc outputDesc = outputTensor->get_desc(); - if (IS_GPU(archInfo->arch)) { + Arch arch = archInfo->arch; + EE ret = NOT_SUPPORTED; + if 
(IS_GPU(arch)) { #ifdef _USE_GPU OclMemory *inputMem = (OclMemory *)inputTensor->get_memory(); OclMemory *outputMem = (OclMemory *)outputTensor->get_memory(); - CHECK_STATUS( - reduction_padding_input_mali(inputDesc, maskDesc, p, &outputDesc, inputMem, outputMem)); + ret = reduction_padding_input_mali(inputDesc, maskDesc, p, &outputDesc, inputMem, outputMem); #endif } else { int start = 0; TensorDesc tmpDesc = inputDesc; U32 cx = (inputDesc.df == DF_NCHWC8) ? 8 : 16; if (inputDesc.df == DF_NCHWC8 || inputDesc.df == DF_NCHWC16) { - for (int i = 0; i < p.axes_num; i++) { + for (int i = 0; i < p.num_axes; i++) { // channel dimension if (p.axes[i] == 1 || p.axes[i] == -3) { start = -1; @@ -124,10 +123,10 @@ EE reduction_infer_output_size(Tensor *inputTensor, tmpDesc.nDims += 1; } outputDesc = tmpDesc; - for (int i = start; i < p.axes_num; i++) { + for (int i = start; i < p.num_axes; i++) { int axis; if (i == -1) { - axis = 4; + axis = inputDesc.nDims; } else { axis = p.axes[i]; } @@ -179,7 +178,14 @@ EE reduction_infer_output_size(Tensor *inputTensor, } } } + ret = SUCCESS; + } +#ifdef _USE_CPU + if (tensorIsShape(inputDesc)) { + ret = reduction_cpu(inputDesc, inputDesc.dims + inputDesc.nDims, tensor0d(), nullptr, p, 0, + nullptr, outputDesc, outputDesc.dims + outputDesc.nDims, arch); } +#endif outputTensor->resize(outputDesc); - return SUCCESS; + return ret; } diff --git a/compute/tensor/src/reshape.cpp b/compute/tensor/src/reshape.cpp index 61b30c55..7a328ceb 100644 --- a/compute/tensor/src/reshape.cpp +++ b/compute/tensor/src/reshape.cpp @@ -56,7 +56,7 @@ EE reshape_infer_forward_tmp_bytes( inputDesc, outputDesc, gclmemInputDesc, gclmemOutputDesc, bytes); #endif } else { - *bytes = UNI_MAX(inputTensor.bytes(), outputTensor.bytes()); + *bytes = 0; ret = SUCCESS; } return ret; diff --git a/compute/tensor/src/rnn.cpp b/compute/tensor/src/rnn.cpp index 7d58dbe6..cf38ccd4 100644 --- a/compute/tensor/src/rnn.cpp +++ b/compute/tensor/src/rnn.cpp @@ -33,13 +33,13 @@ EE rnn_transform_filter(std::vector filterTensors, std::vector filters = get_data_from_tensors(filterTensors, arch); std::vector ftmDescs(ftmTensors.size()); std::vector ftms = get_data_from_tensor_ptrs(ftmTensors, arch); + std::vector scale(ftmTensors.size(), -1); EE ret = NOT_SUPPORTED; - if (IS_CPU(arch)) { #ifdef _USE_CPU ret = rnn_transform_filter_cpu(filterDescs.data(), (const void **)filters.data(), - rnnParamSpec, ftmDescs.data(), ftms.data(), arch); + rnnParamSpec, ftmDescs.data(), ftms.data(), scale.data(), arch); #endif #ifdef _USE_GPU } else if (IS_GPU(arch)) { @@ -59,6 +59,7 @@ EE rnn_transform_filter(std::vector filterTensors, } for (U32 i = 0; i < ftmTensors.size(); i++) { ftmTensors[i]->resize(ftmDescs[i]); + ftmTensors[i]->set_scale(scale[i]); } return ret; } @@ -103,14 +104,14 @@ EE rnn_infer_output_size(std::vector inputTensors, for (U32 i = 0; i < inputDesc.nDims - 3; ++i) { xDim *= inputDesc.dims[i]; } - U32 num = (rnnParamSpec.biDirection) ? 2 : 1; - U32 hDim = num * rnnParamSpec.numOutput; + U32 num = (rnnParamSpec.bi_direction) ? 2 : 1; + U32 hDim = num * rnnParamSpec.num_outputs; std::vector outputDescs; TensorDesc outputDesc = tensor3df(idt, DF_MTK, batch, step, hDim); outputDescs.push_back(outputDesc); - U32 column = (rnnParamSpec.numProjection > 0) ? rnnParamSpec.numProjection - : rnnParamSpec.numOutput; + U32 column = (rnnParamSpec.num_projection > 0) ? 
rnnParamSpec.num_projection + : rnnParamSpec.num_outputs; if (outputTensors.size() == 2) { if (rnnParamSpec.mode == RNN_LSTM) { outputDesc = tensor2df(idt, DF_NORMAL, batch, column + hDim); @@ -205,9 +206,13 @@ EE rnn(std::vector inputTensors, EE ret = NOT_SUPPORTED; if (IS_CPU(arch)) { #ifdef _USE_CPU + std::vector scale(filterTensors.size()); + for (U32 i = 0; i < filterTensors.size(); i++) { + scale[i] = filterTensors[i].get_scale(); + } ret = rnn_cpu(inputDescs[0], inputs[0], filterDescs.data(), (const void **)filters.data(), - biasDescs.data(), (const void **)biases.data(), rnnParamSpec, tmpBytes, tmp, - outputDescs[0], outputs[0], arch); + biasDescs.data(), (const void **)biases.data(), scale.data(), rnnParamSpec, tmpBytes, + tmp, outputDescs[0], outputs[0], arch); #endif } else if (IS_GPU(archInfo->arch)) { #ifdef _USE_GPU @@ -260,7 +265,7 @@ EE rnncell_infer_output_size(std::vector inputTensor, DataFormat idf; U32 batch, xDim; CHECK_STATUS(tensor2dGet(inputDesc, &idt, &idf, &batch, &xDim)); - U32 hDim = rnnParamSpec.numOutput; + U32 hDim = rnnParamSpec.num_outputs; outputDesc = tensor2df(idt, idf, batch, hDim); if (IS_GPU(arch)) { #ifdef _USE_GPU @@ -339,6 +344,7 @@ EE rnncell_transform_filter(std::vector filterTensors, std::vector filters = get_data_from_tensors(filterTensors, arch); std::vector ftmDescs(ftmTensors.size()); std::vector ftms = get_data_from_tensor_ptrs(ftmTensors, arch); + std::vector scale(ftmTensors.size(), -1); EE ret = NOT_SUPPORTED; if (IS_GPU(arch)) { @@ -347,7 +353,7 @@ EE rnncell_transform_filter(std::vector filterTensors, GCLMem filterTranArray[2]; filterArray[0] = *((GCLMem_t)filters[0]); filterTranArray[0] = *((GCLMem_t)ftms[0]); - if (rnnParamSpec.numProjection > 0) { + if (rnnParamSpec.num_projection > 0) { filterArray[1] = *((GCLMem_t)filters[1]); filterTranArray[1] = *((GCLMem_t)ftms[1]); } @@ -358,6 +364,7 @@ EE rnncell_transform_filter(std::vector filterTensors, } for (U32 i = 0; i < ftmTensors.size(); i++) { ftmTensors[i]->resize(ftmDescs[i]); + ftmTensors[i]->set_scale(scale[i]); } return ret; } @@ -407,9 +414,13 @@ EE rnncell(Tensor xTensor, EE ret = NOT_SUPPORTED; if (IS_CPU(arch)) { #ifdef _USE_CPU + std::vector scale(filterTensors.size()); + for (U32 i = 0; i < filterTensors.size(); i++) { + scale[i] = filterTensors[i].get_scale(); + } ret = rnncell_cpu(xDesc, currentX, filterDescs.data(), (const void **)filters.data(), - biasDescs.data(), (const void **)biases.data(), state, rnnParamSpec, batchStrideX, - batchStrideH, tmpBytes, tmp, hDesc, currentH, archInfo->arch); + biasDescs.data(), (const void **)biases.data(), scale.data(), state, rnnParamSpec, + batchStrideX, batchStrideH, tmpBytes, tmp, hDesc, currentH, archInfo->arch); #endif #ifdef _USE_GPU } else if (IS_GPU(arch)) { @@ -417,7 +428,7 @@ EE rnncell(Tensor xTensor, GCLMem biasArray[2]; filterArray[0] = *((GCLMem_t)filters[0]); biasArray[0] = *((GCLMem_t)biases[0]); - if (rnnParamSpec.numProjection > 0) { + if (rnnParamSpec.num_projection > 0) { filterArray[1] = *((GCLMem_t)filters[1]); //biasArray[1] = *((GCLMem_t)biases[1]);currently only init one bias } diff --git a/compute/tensor/src/roialign.cpp b/compute/tensor/src/roialign.cpp index af9711fe..dafc68c3 100644 --- a/compute/tensor/src/roialign.cpp +++ b/compute/tensor/src/roialign.cpp @@ -85,6 +85,8 @@ EE roialign_infer_forward_tmp_bytes( CHECK_STATUS( roialign_infer_forward_tmp_bytes_mali(inputDesc, gclmemInputDesc, outputDesc, bytes)); #endif + } else { + *bytes = 0; } return SUCCESS; } @@ -98,7 +100,6 @@ EE roialign(std::vector 
inputTensor, auto arch = archInfo->arch; std::vector inputDesc = get_desc_from_tensors(inputTensor); std::vector input = get_data_from_tensors(inputTensor, arch); - void *tmpbuf = get_ptr_from_tensor(tmpTensor, arch); TensorDesc outputDesc = outputTensor.get_desc(); void *output = get_ptr_from_tensor(outputTensor, arch); EE ret = NOT_SUPPORTED; @@ -108,6 +109,7 @@ EE roialign(std::vector inputTensor, #endif } else if (IS_GPU(arch)) { #ifdef _USE_GPU + void *tmpbuf = get_ptr_from_tensor(tmpTensor, arch); ret = roialign_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, input, p, (GCLMem_t)tmpbuf, outputDesc, (GCLMem_t)output); #endif diff --git a/compute/tensor/src/scale.cpp b/compute/tensor/src/scale.cpp index 94a02828..1c02eba5 100644 --- a/compute/tensor/src/scale.cpp +++ b/compute/tensor/src/scale.cpp @@ -12,14 +12,8 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #include "tensor_computing.h" -#ifdef _USE_GENERAL -#include "cpu/general/tensor_computing_general.h" -#endif -#ifdef _USE_X86 -#include "cpu/x86/tensor_computing_x86.h" -#endif -#ifdef _USE_NEON -#include "cpu/arm/tensor_computing_arm.h" +#ifdef _USE_CPU +#include "cpu/tensor_computing_cpu.h" #endif #ifdef _USE_GPU #include "gpu/mali/tensor_computing_mali.h" @@ -83,17 +77,9 @@ EE scale(Tensor inputTensor, void *output = get_ptr_from_tensor(outputTensor, arch); EE ret = NOT_SUPPORTED; - if (IS_GENERAL(arch)) { -#ifdef _USE_GENERAL - ret = scale_general(inputDesc, input, alpha, beta, p, outputDesc, output); -#endif -#ifdef _USE_X86 - } else if (IS_X86(arch)) { - ret = scale_x86(inputDesc, input, alpha, beta, p, outputDesc, output); -#endif -#ifdef _USE_NEON - } else if (IS_ARM(arch)) { - ret = scale_arm(inputDesc, input, alpha, beta, p, outputDesc, output); + if (IS_CPU(arch)) { +#ifdef _USE_CPU + ret = scale_cpu(inputDesc, input, alpha, beta, p, outputDesc, output, arch); #endif #ifdef _USE_GPU } else if (IS_GPU(arch)) { diff --git a/compute/tensor/src/slice.cpp b/compute/tensor/src/slice.cpp index a7764c7f..2b218dcc 100644 --- a/compute/tensor/src/slice.cpp +++ b/compute/tensor/src/slice.cpp @@ -20,12 +20,9 @@ #endif inline EE slice_infer_output_size_cpu( - TensorDesc inputDesc, SliceParamSpec p, std::vector *outputDesc) + TensorDesc inputDesc, SliceParamSpec p, std::vector& outputDesc) { - if (nullptr == outputDesc) { - CHECK_STATUS(NULL_POINTER); - } - U32 num = (*outputDesc).size(); + U32 num = outputDesc.size(); int axis = (p.axis + inputDesc.nDims) % inputDesc.nDims; I32 *slice_points = p.slice_points; @@ -37,12 +34,13 @@ inline EE slice_infer_output_size_cpu( } } I32 target_axis = inputDesc.nDims - 1 - axis; + I32 cDim = (I32)inputDesc.nDims - 2; if (splitEqual) { CHECK_REQUIREMENT(0 == inputDesc.dims[target_axis] % num); inputDesc.dims[target_axis] /= num; } for (U32 i = 0; i < num; i++) { - (*outputDesc)[i] = inputDesc; + outputDesc[i] = inputDesc; if (splitEqual) { continue; } @@ -55,7 +53,7 @@ inline EE slice_infer_output_size_cpu( if (i < num - 1) { next_point = slice_points[i]; } - if (i == 0 && num == 1 && p.slice_size == 1) { // Could happen in onnx + if (i == 0 && num == 1 && p.num_slice == 1) { // Could happen in onnx next_point = slice_points[0]; } if (prev_point < 0) { @@ -70,20 +68,45 @@ inline EE slice_infer_output_size_cpu( next_point = 0; } } - (*outputDesc)[i].dims[target_axis] = next_point - prev_point; + outputDesc[i].dims[target_axis] = next_point - prev_point; + } + + for (U32 i = 0; i < num; i++) { + if ((cDim >= 0) && (outputDesc[i].dims[cDim] 
% 8 != 0)) { + if (outputDesc[i].nDims >= 4) { + outputDesc[i].df = DF_NCHW; + } else if (outputDesc[i].nDims == 3) { + outputDesc[i].df = DF_MTK; + } else if (outputDesc[i].nDims == 2) { + outputDesc[i].df = DF_NORMAL; + } else { + return NOT_SUPPORTED; + } + } } - return SUCCESS; + + EE ret = SUCCESS; +#ifdef _USE_CPU + if (tensorIsShape(inputDesc)) { + std::vector output(num); + for (U32 i = 0; i < num; i++) { + output[i] = outputDesc[i].dims + outputDesc[i].nDims; + } + ret = slice_cpu(inputDesc, inputDesc.dims + inputDesc.nDims, p, outputDesc, output); + } +#endif + return ret; } EE slice_infer_output_size( Tensor *inputTensor, SliceParamSpec p, std::vector outputTensor, ArchInfo_t archInfo) { if (inputTensor == nullptr) { - CHECK_STATUS(NULL_POINTER); + return NULL_POINTER; } TensorDesc inputDesc = inputTensor->get_desc(); std::vector outputDesc = get_desc_from_tensor_ptrs(outputTensor); - CHECK_STATUS(slice_infer_output_size_cpu(inputDesc, p, &outputDesc)); + EE ret = slice_infer_output_size_cpu(inputDesc, p, outputDesc); if (IS_GPU(archInfo->arch)) { #ifdef _USE_GPU OclMemory *inputMem = (OclMemory *)inputTensor->get_memory(); @@ -91,13 +114,13 @@ EE slice_infer_output_size( for (U32 i = 0; i < outputTensor.size(); i++) { outputMems.push_back((OclMemory *)outputTensor[i]->get_memory()); } - CHECK_STATUS(slice_padding_input_mali(inputDesc, p, &outputDesc, inputMem, outputMems)); + ret = slice_padding_input_mali(inputDesc, p, &outputDesc, inputMem, outputMems); #endif } for (U32 i = 0; i < outputTensor.size(); i++) { outputTensor[i]->resize(outputDesc[i]); } - return SUCCESS; + return ret; } EE slice_infer_forward_tmp_bytes(Tensor inputTensor, @@ -139,7 +162,7 @@ EE slice(Tensor inputTensor, EE ret = NOT_SUPPORTED; if (IS_CPU(arch)) { #ifdef _USE_CPU - ret = slice_cpu(inputDesc, input, p, outputDesc, &output); + ret = slice_cpu(inputDesc, input, p, outputDesc, output); #endif #ifdef _USE_GPU } else if (IS_GPU(arch)) { diff --git a/compute/tensor/src/softmax.cpp b/compute/tensor/src/softmax.cpp index cc173c1b..ac2b02b8 100644 --- a/compute/tensor/src/softmax.cpp +++ b/compute/tensor/src/softmax.cpp @@ -25,37 +25,6 @@ #include "gpu/mali/tensor_computing_mali.h" #endif -EE softmax( - Tensor inputTensor, SoftmaxParamSpec p, Tensor tmpTensor, Tensor outputTensor, ArchInfo_t archInfo) -{ - auto arch = archInfo->arch; - TensorDesc inputDesc = inputTensor.get_desc(); - void *input = get_ptr_from_tensor(inputTensor, arch); - TensorDesc outputDesc = outputTensor.get_desc(); - void *output = get_ptr_from_tensor(outputTensor, arch); - EE ret = NOT_SUPPORTED; - if (IS_GENERAL(arch)) { -#ifdef _USE_GENERAL - ret = softmax_general(inputDesc, input, p, outputDesc, output); -#endif -#ifdef _USE_X86 - } else if (IS_X86(arch)) { - ret = softmax_x86(inputDesc, input, p, outputDesc, output); -#endif -#ifdef _USE_NEON - } else if (IS_ARM(arch)) { - ret = softmax_arm(inputDesc, input, p, outputDesc, output); -#endif -#ifdef _USE_GPU - } else if (IS_GPU(arch)) { - void *tmp = get_ptr_from_tensor(tmpTensor, arch); - ret = softmax_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, (GCLMem_t)input, - p, (GCLMem_t)tmp, outputDesc, (GCLMem_t)output); -#endif - } - return ret; -} - EE softmax_infer_output_size( Tensor *inputTensor, SoftmaxParamSpec p, Tensor *outputTensor, ArchInfo_t archInfo) { @@ -102,3 +71,59 @@ EE softmax_infer_forward_tmp_bytes( } return ret; } + +EE softmax( + Tensor inputTensor, SoftmaxParamSpec p, Tensor tmpTensor, Tensor outputTensor, ArchInfo_t archInfo) +{ + auto arch = 
archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + EE ret = NOT_SUPPORTED; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + ret = softmax_general(inputDesc, input, p, outputDesc, output); +#endif +#ifdef _USE_X86 + } else if (IS_X86(arch)) { + ret = softmax_x86(inputDesc, input, p, outputDesc, output); +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + ret = softmax_arm(inputDesc, input, p, outputDesc, output); +#endif +#ifdef _USE_GPU + } else if (IS_GPU(arch)) { + void *tmp = get_ptr_from_tensor(tmpTensor, arch); + ret = softmax_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, (GCLMem_t)input, + p, (GCLMem_t)tmp, outputDesc, (GCLMem_t)output); +#endif + } + return ret; +} + +EE logsoftmax( + Tensor inputTensor, SoftmaxParamSpec p, Tensor tmpTensor, Tensor outputTensor, ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + EE ret = NOT_SUPPORTED; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + ret = logsoftmax_general(inputDesc, input, p, outputDesc, output); +#endif +#ifdef _USE_X86 + } else if (IS_X86(arch)) { + ret = logsoftmax_x86(inputDesc, input, p, outputDesc, output); +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + ret = logsoftmax_arm(inputDesc, input, p, outputDesc, output); +#endif + } + return ret; +} diff --git a/compute/tensor/src/space2depth.cpp b/compute/tensor/src/space2depth.cpp index 7c6b9cb5..6c8d6efc 100644 --- a/compute/tensor/src/space2depth.cpp +++ b/compute/tensor/src/space2depth.cpp @@ -12,14 +12,15 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
#include "tensor_computing.h" +#ifdef _USE_CPU +#include "cpu/tensor_computing_cpu.h" +#endif #ifdef _USE_GPU #include "gpu/mali/tensor_computing_mali.h" #endif -EE space2depth_infer_output_size(Tensor *inputTensor, - Space2DepthParamSpec space2DepthPara, - Tensor *outputTensor, - ArchInfo_t archInfo) +EE space2depth_infer_output_size( + Tensor *inputTensor, Space2DepthParamSpec p, Tensor *outputTensor, ArchInfo_t archInfo) { if (inputTensor == nullptr) { CHECK_STATUS(NULL_POINTER); @@ -29,35 +30,42 @@ EE space2depth_infer_output_size(Tensor *inputTensor, } auto arch = archInfo->arch; TensorDesc inputDesc = inputTensor->get_desc(); - TensorDesc outputDesc = outputTensor->get_desc(); + TensorDesc outputDesc = inputDesc; + EE ret = NOT_SUPPORTED; if (IS_GPU(arch)) { #ifdef _USE_GPU OclMemory *inputMem = (OclMemory *)inputTensor->get_memory(); OclMemory *outputMem = (OclMemory *)outputTensor->get_memory(); - CHECK_STATUS(space2depth_padding_input_mali( - inputDesc, space2DepthPara, &outputDesc, inputMem, outputMem)); + ret = space2depth_padding_input_mali(inputDesc, p, &outputDesc, inputMem, outputMem); #endif + } else { + for (int i = 0; i < (int)outputDesc.nDims - 2; i++) { + outputDesc.dims[i] /= p.block_size; + outputDesc.dims[outputDesc.nDims - 2] *= p.block_size; + } + outputDesc.df = getTensorDefaultDataFormat(outputDesc.nDims); + ret = SUCCESS; } outputTensor->resize(outputDesc); - return SUCCESS; + return ret; } -EE space2depth(Tensor inputTensor, - Space2DepthParamSpec space2DepthPara, - Tensor outputTensor, - ArchInfo_t archInfo) +EE space2depth(Tensor inputTensor, Space2DepthParamSpec p, Tensor outputTensor, ArchInfo_t archInfo) { auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); EE ret = NOT_SUPPORTED; if (IS_GPU(arch)) { #ifdef _USE_GPU - TensorDesc inputDesc = inputTensor.get_desc(); - void *input = get_ptr_from_tensor(inputTensor, arch); - TensorDesc outputDesc = outputTensor.get_desc(); - void *output = get_ptr_from_tensor(outputTensor, arch); - ret = space2depth_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, - (GCLMem_t)input, space2DepthPara, outputDesc, (GCLMem_t)output); + (GCLMem_t)input, p, outputDesc, (GCLMem_t)output); +#endif + } else { +#ifdef _USE_CPU + ret = space2depth_cpu(inputDesc, input, p, outputDesc, output); #endif } return ret; diff --git a/compute/tensor/src/squeeze.cpp b/compute/tensor/src/squeeze.cpp index d9d2c264..d08b1431 100644 --- a/compute/tensor/src/squeeze.cpp +++ b/compute/tensor/src/squeeze.cpp @@ -15,13 +15,13 @@ #ifdef _USE_GPU #include "gpu/mali/tensor_computing_mali.h" #endif -#include EE squeeze(Tensor inputTensor, Tensor tmpTensor, Tensor outputTensor, ArchInfo_t archInfo) { auto arch = archInfo->arch; TensorDesc inputDesc = inputTensor.get_desc(); void *input = get_ptr_from_tensor(inputTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); void *output = get_ptr_from_tensor(outputTensor, arch); EE ret = NOT_SUPPORTED; @@ -34,8 +34,13 @@ EE squeeze(Tensor inputTensor, Tensor tmpTensor, Tensor outputTensor, ArchInfo_t #endif #ifdef _USE_CPU } else { - if (output != input) { - memcpy(output, input, tensorNumBytes(inputDesc)); + if ((inputDesc.df == DF_NCHWC8 || inputDesc.df == DF_NCHWC16) && + inputDesc.df != outputDesc.df) { + TensorDesc nchwDesc = inputDesc; + nchwDesc.df = DF_NCHW; + transformToNCHW(inputDesc, input, 
nchwDesc, output); + } else { + UNI_MEMCPY(output, input, tensorNumBytes(inputDesc)); } ret = SUCCESS; #endif @@ -47,25 +52,41 @@ EE squeeze_infer_output_size_cpu( TensorDesc inputDesc, int *axes, int axesNum, TensorDesc *outputDesc) { *outputDesc = inputDesc; + if ((int)inputDesc.nDims == axesNum) { + outputDesc->nDims = 1; + outputDesc->df = DF_SCALAR; + return SUCCESS; + } for (int i = 0; i < axesNum; i++) { int axis = axes[i]; if (axis < 0) { axis += inputDesc.nDims; } - outputDesc->dims[inputDesc.nDims - 1 - axis] = 0; + if (outputDesc->dims[inputDesc.nDims - 1 - axis] != 1) { + UNI_ERROR_LOG( + "try to squeeze non-one dimension in (%s).\n", tensorDesc2Str(inputDesc).c_str()); + } + outputDesc->dims[inputDesc.nDims - 1 - axis] = INT_MAX; } U32 index = 0; for (U32 i = 0; i < inputDesc.nDims; i++) { - if (outputDesc->dims[i] != 0) { + if (outputDesc->dims[i] != INT_MAX) { outputDesc->dims[index++] = outputDesc->dims[i]; } } CHECK_REQUIREMENT(index + axesNum == inputDesc.nDims); outputDesc->nDims = index; - if (inputDesc.df != DF_NCHWC8) { - outputDesc->df = getTensorDefaultDataFormat(outputDesc->nDims); - } else { - outputDesc->df = DF_NCHWC8; + outputDesc->df = getTensorDefaultDataFormat(outputDesc->nDims); + if (inputDesc.df == DF_NCHWC8 || inputDesc.df == DF_NCHWC16) { + bool changeChannelAxis = false; + for (int i = 0; i < axesNum; i++) { + if (axes[i] < 1) { + changeChannelAxis = true; + } + } + if (!changeChannelAxis) { + outputDesc->df = inputDesc.df; + } } return SUCCESS; } @@ -81,7 +102,7 @@ EE squeeze_infer_output_size( } TensorDesc inputDesc = inputTensor->get_desc(); TensorDesc outputDesc = outputTensor->get_desc(); - CHECK_STATUS(squeeze_infer_output_size_cpu(inputDesc, p.axes, p.axes_num, &outputDesc)); + CHECK_STATUS(squeeze_infer_output_size_cpu(inputDesc, p.axes, p.num_axes, &outputDesc)); outputTensor->resize(outputDesc); return SUCCESS; } diff --git a/compute/tensor/src/tfslice.cpp b/compute/tensor/src/tfslice.cpp index b6c0a824..f477a3c5 100644 --- a/compute/tensor/src/tfslice.cpp +++ b/compute/tensor/src/tfslice.cpp @@ -22,10 +22,7 @@ EE tfslice_infer_output_size( Tensor *inputTensor, TfSliceParamSpec p, Tensor *outputTensor, ArchInfo_t archInfo) { - if (inputTensor == nullptr) { - CHECK_STATUS(NULL_POINTER); - } - if (outputTensor == nullptr) { + if (inputTensor == nullptr || outputTensor == nullptr) { CHECK_STATUS(NULL_POINTER); } TensorDesc inputDesc = inputTensor->get_desc(); @@ -38,8 +35,15 @@ EE tfslice_infer_output_size( } #endif } + EE ret = SUCCESS; +#ifdef _USE_CPU + if (tensorIsShape(inputDesc)) { + ret = tfslice_cpu(inputDesc, inputDesc.dims + inputDesc.nDims, p, outputDesc, + outputDesc.dims + outputDesc.nDims); + } +#endif outputTensor->resize(outputDesc); - return SUCCESS; + return ret; } EE tfslice_infer_forward_tmp_bytes( diff --git a/compute/tensor/src/tile.cpp b/compute/tensor/src/tile.cpp index cce7feff..94ce1b87 100644 --- a/compute/tensor/src/tile.cpp +++ b/compute/tensor/src/tile.cpp @@ -23,15 +23,15 @@ EE tile_infer_output_size( auto inDim = inputTensor->get_desc(); auto outDim = inDim; - if ((int)inDim.nDims == tileParamSpec.dimsSize) { - for (int i = 0; i < tileParamSpec.dimsSize; i++) { - outDim.dims[tileParamSpec.dimsSize - 1 - i] = - inDim.dims[tileParamSpec.dimsSize - 1 - i] * tileParamSpec.repeatsInfo[i]; + if ((int)inDim.nDims == tileParamSpec.num_repeats) { + for (int i = 0; i < tileParamSpec.num_repeats; i++) { + outDim.dims[tileParamSpec.num_repeats - 1 - i] = + inDim.dims[tileParamSpec.num_repeats - 1 - i] * 
tileParamSpec.repeats[i]; } } else { int axis = (tileParamSpec.axis >= 0) ? tileParamSpec.axis : tileParamSpec.axis + inDim.nDims; axis = inDim.nDims - 1 - axis; - outDim.dims[axis] = outDim.dims[axis] * tileParamSpec.repeatsInfo[0]; + outDim.dims[axis] = outDim.dims[axis] * tileParamSpec.repeats[0]; } if (IS_GPU(archInfo->arch)) { #ifdef _USE_GPU @@ -89,25 +89,25 @@ EE tile(Tensor inputTensor, outputDesc.dims[0] *= 8; } - if (tileParamSpec.dimsSize != (int)inputDesc.nDims) { - CHECK_REQUIREMENT(tileParamSpec.dimsSize == 1); + if (tileParamSpec.num_repeats != (int)inputDesc.nDims) { + CHECK_REQUIREMENT(tileParamSpec.num_repeats == 1); int axis = (tileParamSpec.axis >= 0) ? tileParamSpec.axis : tileParamSpec.axis + inputDesc.nDims; - U32 tiles = tileParamSpec.repeatsInfo[0]; + U32 tiles = tileParamSpec.repeats[0]; for (int i = 0; i < (int)inputDesc.nDims; ++i) { - tileParamSpec.repeatsInfo[i] = 1; + tileParamSpec.repeats[i] = 1; if (axis == i) { - tileParamSpec.repeatsInfo[i] = tiles; + tileParamSpec.repeats[i] = tiles; } } } U32 repeat_num = 0; for (U32 i = 0; i < inputDesc.nDims; ++i) { - repeat_num += (tileParamSpec.repeatsInfo[inputDesc.nDims - 1 - i] > 1); + repeat_num += (tileParamSpec.repeats[inputDesc.nDims - 1 - i] > 1); } if (repeat_num == 0) { - memcpy(output, input, tensorNumBytes(inputDesc)); + UNI_MEMCPY(output, input, tensorNumBytes(inputDesc)); return SUCCESS; } @@ -122,14 +122,14 @@ EE tile(Tensor inputTensor, bool first_copy = true; for (U32 j = 0; j < inputDesc.nDims; ++j) { - if (tileParamSpec.repeatsInfo[inputDesc.nDims - 1 - j] > 1) { - U32 tiles = tileParamSpec.repeatsInfo[inputDesc.nDims - 1 - j]; + if (tileParamSpec.repeats[inputDesc.nDims - 1 - j] > 1) { + U32 tiles = tileParamSpec.repeats[inputDesc.nDims - 1 - j]; int loopOuter = itile_size[inputDesc.nDims - 1] / itile_size[j]; if (first_copy) { first_copy = false; for (int i = 0; i < loopOuter; ++i) { for (U32 ii = 0; ii < tiles; ++ii) { - memcpy(output_ptr + i * tiles * itile_size[j] + ii * itile_size[j], + UNI_MEMCPY(output_ptr + i * tiles * itile_size[j] + ii * itile_size[j], input_ptr + i * itile_size[j], itile_size[j]); } } @@ -138,7 +138,7 @@ EE tile(Tensor inputTensor, for (U32 ii = 0; ii < tiles; ++ii) { if (i != 0 || ii != 0) { U32 copy_size = otile_size[j - 1] * inputDesc.dims[i]; - memcpy(output_ptr + i * tiles * copy_size + ii * copy_size, + UNI_MEMCPY(output_ptr + i * tiles * copy_size + ii * copy_size, output_ptr + i * copy_size, copy_size); } } diff --git a/compute/tensor/src/topk.cpp b/compute/tensor/src/topk.cpp index 1a9dc4b2..71a15d69 100644 --- a/compute/tensor/src/topk.cpp +++ b/compute/tensor/src/topk.cpp @@ -86,8 +86,10 @@ EE topk_infer_output_size(Tensor *inputTensor, outputDesc = inputDesc; outputIndicesDesc = inputDesc; int axis = inputDesc.nDims - 1 - (p.axis + inputDesc.nDims) % inputDesc.nDims; - outputDesc.dims[axis] = p.topk; - outputIndicesDesc.dims[axis] = p.topk; + if (p.k > 0) { + outputDesc.dims[axis] = p.k; + outputIndicesDesc.dims[axis] = p.k; + } outputIndicesDesc.dt = DT_I32; outputTensor->resize(outputDesc); outputIndicesTensor->resize(outputIndicesDesc); diff --git a/compute/tensor/src/transpose.cpp b/compute/tensor/src/transpose.cpp index 1f9444cd..0b848e58 100644 --- a/compute/tensor/src/transpose.cpp +++ b/compute/tensor/src/transpose.cpp @@ -34,13 +34,13 @@ EE transpose(Tensor inputTensor, void *input = get_ptr_from_tensor(inputTensor, arch); TensorDesc outputDesc = outputTensor.get_desc(); void *output = get_ptr_from_tensor(outputTensor, arch); - std::vector 
tmpDims(p.trans_dims, p.trans_dims + p.trans_size); + std::vector tmpDims(p.axes, p.axes + p.num_axes); if (IS_CPU(arch)) { // Keep transDims unchanged so that input resize does not lead to error - if (inputDesc.nDims == 4 && p.trans_size == 3 && inputDesc.dims[0] == 1) { - inputDesc = tensor3df(inputDesc.dt, inputDesc.df, inputDesc.dims[3], inputDesc.dims[2], - inputDesc.dims[1]); - } + //if (inputDesc.nDims == 4 && p.num_axes == 3 && inputDesc.dims[0] == 1) { + // inputDesc = tensor3df(inputDesc.dt, inputDesc.df, inputDesc.dims[3], inputDesc.dims[2], + // inputDesc.dims[1]); + //} if (DF_NCHWC8 == inputDesc.df || DF_NCHWC16 == inputDesc.df) { U32 cx = 8; @@ -48,7 +48,7 @@ EE transpose(Tensor inputTensor, cx = 16; CHECK_REQUIREMENT(inputDesc.dims[inputDesc.nDims - 2] % 16 == 0); } - if (inputDesc.nDims == p.trans_size) { + if (inputDesc.nDims == p.num_axes) { auto ptr = std::find(tmpDims.begin(), tmpDims.end(), 1); tmpDims.insert(ptr + 1, inputDesc.nDims); } @@ -75,6 +75,22 @@ EE transpose(Tensor inputTensor, } outputDesc = desc; } + if (outputDesc.df == DF_NCHWC8) { + int icaxis = inputDesc.nDims - 1 - p.axes[1]; + for (int i = inputDesc.nDims; i > icaxis; i--) { + inputDesc.dims[i] = inputDesc.dims[i - 1]; + } + inputDesc.nDims++; + inputDesc.dims[icaxis] = 8; + inputDesc.dims[icaxis + 1] /= 8; + for (int i = outputDesc.nDims; i > 0; i--) { + outputDesc.dims[i] = outputDesc.dims[i - 1]; + } + outputDesc.nDims++; + outputDesc.dims[0] = 8; + outputDesc.dims[outputDesc.nDims - 2] /= 8; + tmpDims.push_back(tmpDims.size()); + } } EE ret = NOT_SUPPORTED; if (IS_GENERAL(arch)) { @@ -102,32 +118,31 @@ inline EE transpose_infer_output_size_cpu( CHECK_STATUS(NULL_POINTER); } - U32 *dim = p.trans_dims; + U32 *dim = p.axes; *outputDesc = inputDesc; - U32 inputDim = inputDesc.nDims; - if (4 == inputDim) { - (*outputDesc).df = DF_NCHW; - } - U32 outputDim = (*outputDesc).nDims; + U32 num = inputDesc.nDims; U32 index = 0; - for (U32 i = 0; i < p.trans_size; i++) { + for (U32 i = 0; i < p.num_axes; i++) { // use 5-dim array to transpose a NCHWC8 tensor. skip c8 axis - if (dim[i] >= inputDim) { + if (dim[i] >= num) { continue; } // NOTE: TensorDesc.dims array is in [W H C N] order. 
// so if you want to transpose [N C H W] format data, we use (dims - 1 - *) // [5 6 7 8] + [0 3 2 1] = [5 8 7 6] // [8 7 6 5] + [0 3 2 1] = [6 7 8 5] - (*outputDesc).dims[outputDim - 1 - index] = inputDesc.dims[inputDim - 1 - dim[i]]; + outputDesc->dims[num - 1 - index] = inputDesc.dims[num - 1 - dim[i]]; index++; } - if (outputDesc->nDims >= 4 || inputDesc.df == DF_NCHWC8) { + if (inputDesc.df == DF_NCHWC8) { outputDesc->df = DF_NCHW; } - if ((*outputDesc).nDims == 4 && p.trans_size == 3 && (*outputDesc).dims[0] == 1) { - (*outputDesc) = tensor3df(inputDesc.dt, DF_NCHW, (*outputDesc).dims[3], - (*outputDesc).dims[2], (*outputDesc).dims[1]); + //if (outputDesc->nDims == 4 && p.num_axes == 3 && outputDesc->dims[0] == 1) { + // (*outputDesc) = tensor3df(inputDesc.dt, DF_NCHW, outputDesc->dims[3], + // outputDesc->dims[2], outputDesc->dims[1]); + //} + if (p.df == DF_NCHWC8 && outputDesc->dims[num - 2] % 8 == 0) { + outputDesc->df = DF_NCHWC8; } return SUCCESS; } diff --git a/compute/tensor/src/unsqueeze.cpp b/compute/tensor/src/unsqueeze.cpp index 289d3086..27ac3d1a 100644 --- a/compute/tensor/src/unsqueeze.cpp +++ b/compute/tensor/src/unsqueeze.cpp @@ -15,13 +15,13 @@ #ifdef _USE_GPU #include "gpu/mali/tensor_computing_mali.h" #endif -#include EE unsqueeze(Tensor inputTensor, Tensor tmpTensor, Tensor outputTensor, ArchInfo_t archInfo) { auto arch = archInfo->arch; TensorDesc inputDesc = inputTensor.get_desc(); void *input = get_ptr_from_tensor(inputTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); void *output = get_ptr_from_tensor(outputTensor, arch); EE ret = NOT_SUPPORTED; @@ -34,8 +34,13 @@ EE unsqueeze(Tensor inputTensor, Tensor tmpTensor, Tensor outputTensor, ArchInfo #endif #ifdef _USE_CPU } else { - if (output != input) { - memcpy(output, input, tensorNumBytes(inputDesc)); + if ((inputDesc.df == DF_NCHWC8 || inputDesc.df == DF_NCHWC16) && + inputDesc.df != outputDesc.df) { + TensorDesc nchwDesc = inputDesc; + nchwDesc.df = DF_NCHW; + transformToNCHW(inputDesc, input, nchwDesc, output); + } else { + UNI_MEMCPY(output, input, tensorNumBytes(inputDesc)); } ret = SUCCESS; #endif @@ -47,11 +52,22 @@ EE unsqueeze_infer_output_size_cpu( TensorDesc inputDesc, int *axes, int axesNum, TensorDesc *outputDesc) { outputDesc->dt = inputDesc.dt; - outputDesc->nDims = inputDesc.nDims + axesNum; - if (inputDesc.df != DF_NCHWC8) { - outputDesc->df = getTensorDefaultDataFormat(outputDesc->nDims); + if (inputDesc.df == DF_SCALAR) { + outputDesc->nDims = axesNum; } else { - outputDesc->df = DF_NCHWC8; + outputDesc->nDims = inputDesc.nDims + axesNum; + } + outputDesc->df = getTensorDefaultDataFormat(outputDesc->nDims); + if (inputDesc.df == DF_NCHWC8 || inputDesc.df == DF_NCHWC16) { + bool changeChannelAxis = false; + for (int i = 0; i < axesNum; i++) { + if (axes[i] <= 1) { + changeChannelAxis = true; + } + } + if (!changeChannelAxis) { + outputDesc->df = inputDesc.df; + } } for (U32 i = 0; i < outputDesc->nDims; i++) { outputDesc->dims[i] = 0; @@ -69,22 +85,28 @@ EE unsqueeze_infer_output_size_cpu( outputDesc->dims[i] = inputDesc.dims[index++]; } } - CHECK_REQUIREMENT(index == inputDesc.nDims); + if (inputDesc.df != DF_SCALAR) { + CHECK_REQUIREMENT(index == inputDesc.nDims); + } +#ifdef _USE_CPU + if (tensorIsShape(inputDesc)) { + for (U32 i = 0; outputDesc->nDims + i < DIM_LEN; i++) { + outputDesc->dims[outputDesc->nDims + i] = inputDesc.dims[inputDesc.nDims + i]; + } + } +#endif return SUCCESS; } EE unsqueeze_infer_output_size( Tensor *inputTensor, UnsqueezeParamSpec p, Tensor 
*outputTensor, ArchInfo_t archInfo) { - if (inputTensor == nullptr) { - CHECK_STATUS(NULL_POINTER); - } - if (outputTensor == nullptr) { - CHECK_STATUS(NULL_POINTER); + if (inputTensor == nullptr || outputTensor == nullptr) { + return NULL_POINTER; } TensorDesc inputDesc = inputTensor->get_desc(); TensorDesc outputDesc = outputTensor->get_desc(); - EE ret = unsqueeze_infer_output_size_cpu(inputDesc, p.axes, p.axes_num, &outputDesc); + EE ret = unsqueeze_infer_output_size_cpu(inputDesc, p.axes, p.num_axes, &outputDesc); outputTensor->resize(outputDesc); return ret; } diff --git a/compute/tensor/src/where.cpp b/compute/tensor/src/where.cpp index 7eec13bc..e0a031f7 100644 --- a/compute/tensor/src/where.cpp +++ b/compute/tensor/src/where.cpp @@ -10,129 +10,153 @@ // WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -#include "tensor_computing.h" -EE where_infer_output_size(Tensor *inputTensor, Tensor *outputTensor, ArchInfo_t archInfo) -{ - auto inDesc = inputTensor->get_desc(); - auto outDesc = inDesc; - outputTensor->resize(outDesc); - return SUCCESS; -} +#include "tensor_computing.h" -bool tensorDescEqual(TensorDesc a, TensorDesc b) +EE where_infer_output_size( + Tensor *xTensor, Tensor *yTensor, Tensor *outputTensor, ArchInfo_t archInfo) { - if (a.nDims != b.nDims) { - return false; - } else { - for (int i = 0; i < (int)(a.nDims); i++) { - if (a.dims[i] != b.dims[i]) { - return false; - } + TensorDesc xDesc = xTensor->get_desc(); + TensorDesc yDesc = yTensor->get_desc(); + TensorDesc outDesc = (xDesc.nDims > yDesc.nDims) ? xDesc : yDesc; + for (U32 i = 0; i < xDesc.nDims; i++) { + if (xDesc.dims[i] > outDesc.dims[i]) { + outDesc.dims[i] = xDesc.dims[i]; + } + } + for (U32 i = 0; i < yDesc.nDims; i++) { + if (yDesc.dims[i] > outDesc.dims[i]) { + outDesc.dims[i] = yDesc.dims[i]; } } - return true; + outputTensor->resize(outDesc); + return SUCCESS; } -int brocastIndex(TensorDesc inputDesc, TensorDesc conditionDesc) +inline static std::vector get_dims(const TensorDesc &desc) { - if (inputDesc.nDims != conditionDesc.nDims) { - return -1; + std::vector dims; + if (desc.df == DF_NCHWC8) { + dims.push_back(8); } - - for (int i = 2; i < (int)(inputDesc.nDims); i++) { - if (inputDesc.dims[i] != conditionDesc.dims[i]) { - return i; - } + for (U32 i = 0; i < desc.nDims; i++) { + dims.push_back(desc.dims[i]); } - return -1; + return dims; } template -static EE diffSourceWhere(TensorDesc inputDesc, - TensorDesc conditionDesc, - TensorDesc yDesc, - T *inputPtr, - U8 *conditionPtr, - T *yPtr, - T *outputPtr) +static void where_kernel(const TensorDesc &conditionDesc, + const U8 *condition, + const TensorDesc &xDesc, + const T *x, + const TensorDesc &yDesc, + const T *y, + const TensorDesc &outDesc, + T *out) { - if (tensorDescEqual(inputDesc, conditionDesc)) { - for (int i = 0; i < (int)(tensorNumElements(inputDesc)); i++) { - if (tensorNumElements(yDesc) == 1) { - outputPtr[i] = (conditionPtr[i] > 0) ? inputPtr[i] : yPtr[0]; - } else if (tensorNumElements(inputDesc) == tensorNumElements(yDesc)) { - outputPtr[i] = (conditionPtr[i] > 0) ? 
inputPtr[i] : yPtr[i]; - } else { - return NOT_SUPPORTED; - } + if (tensorNumElements(xDesc) == 1 && + tensorNumElements(conditionDesc) >= outDesc.dims[0] && + tensorNumElements(yDesc) == tensorNumElements(outDesc)) + { + UNI_MEMCPY(out, y, tensorNumBytes(yDesc)); + DataType odt; + DataFormat odf; + U32 on, oc, oh, ow; + if (tensorIs3d(outDesc)) { + CHECK_STATUS(tensor3dGet(outDesc, &odt, &odf, &on, &oc, &ow)); + oh = 1; + } else if (tensorIs4d(outDesc)) { + CHECK_STATUS(tensor4dGet(outDesc, &odt, &odf, &on, &oc, &oh, &ow)); + } else { + UNI_ERROR_LOG("where currently only support 3d/4d tensor.\n"); + return; } - } else { - int bIndex = brocastIndex(inputDesc, conditionDesc); - if (bIndex == -1) { - return NOT_SUPPORTED; + U8 c8 = 1; + if (odf == DF_NCHWC8) { + c8 = 8; } - int batchNum = 1; - for (int i = 0; i < bIndex; i++) { - batchNum *= inputDesc.dims[i]; - } - for (int i = 0; i < (int)(inputDesc.dims[bIndex]); i++) { - for (int j = 0; j < (int)(inputDesc.dims[1]); j++) { - for (int k = 0; k < (int)(inputDesc.dims[0]); k++) { - if (tensorNumElements(yDesc) == 1) { - outputPtr[i * batchNum + j * inputDesc.dims[0] + k] = - conditionPtr[j * conditionDesc.dims[0] + k] > 0 - ? inputPtr[i * batchNum + j * inputDesc.dims[0] + k] - : yPtr[0]; - } else if (tensorNumElements(inputDesc) == tensorNumElements(yDesc)) { - outputPtr[i * batchNum + j * inputDesc.dims[0] + k] = - conditionPtr[j * conditionDesc.dims[0] + k] > 0 - ? inputPtr[i * batchNum + j * inputDesc.dims[0] + k] - : yPtr[i * batchNum + j * inputDesc.dims[0] + k]; - } else { - return NOT_SUPPORTED; + oc /= c8; + for (U32 w = 0; w < ow; w++) { + if (condition[w]) { + for (U32 n = 0; n < on; n++) { + for (U32 c0 = 0; c0 < oc; c0++) { + for (U32 h = 0; h < oh; h++) { + for (U32 c1 = 0; c1 < c8; c1++) { + out[(((n * oc + c0) * oh + h) * ow + w) * c8 + c1] = x[0]; + } + } } } } } + return; + } + U32 length = tensorNumElements(outDesc); + if (xDesc.df != DF_NCHWC8 && yDesc.df != DF_NCHWC8) { + for (U32 i = 0; i < length; i++) { + const std::vector &id = calculateLocalIndex(i, outDesc.dims, outDesc.nDims); + int ci = calculateGlobalIndex(id.data(), conditionDesc.dims, conditionDesc.nDims); + int xi = calculateGlobalIndex(id.data(), xDesc.dims, xDesc.nDims); + int yi = calculateGlobalIndex(id.data(), yDesc.dims, yDesc.nDims); + out[i] = condition[ci] ? x[xi] : y[yi]; + } + return; + } + const std::vector &cdims = get_dims(conditionDesc); + const std::vector &xdims = get_dims(xDesc); + const std::vector &ydims = get_dims(yDesc); + const std::vector &odims = get_dims(outDesc); + std::vector id_c1(odims.size()), id_c8(odims.size() + 1); + U32 *cid = (conditionDesc.nDims == cdims.size()) ? id_c1.data() : id_c8.data(); + U32 *xid = (xDesc.nDims == xdims.size()) ? id_c1.data() : id_c8.data(); + U32 *yid = (yDesc.nDims == ydims.size()) ? 
id_c1.data() : id_c8.data(); + int axis = outDesc.nDims - 2; + for (U32 i = 0; i < length; i++) { + const std::vector &id = calculateLocalIndex(i, odims.data(), odims.size()); + if (outDesc.nDims != odims.size()) { + UNI_MEMCPY(id_c8.data(), id.data(), id.size() * sizeof(float)); + UNI_MEMCPY(id_c1.data(), id.data() + 1, (id.size() - 1) * sizeof(float)); + id_c1[axis] = id_c1[axis] * 8 + id[0]; + } else { + UNI_MEMCPY(id_c1.data(), id.data(), id.size() * sizeof(float)); + UNI_MEMCPY(id_c8.data() + 1, id.data(), id.size() * sizeof(float)); + id_c8[0] = id[axis] % 8; + id_c8[axis + 1] = id[axis] / 8; + } + int ci = calculateGlobalIndex(cid, cdims.data(), cdims.size()); + int xi = calculateGlobalIndex(xid, xdims.data(), xdims.size()); + int yi = calculateGlobalIndex(yid, ydims.data(), ydims.size()); + out[i] = condition[ci] ? x[xi] : y[yi]; } - return SUCCESS; } -// replaceF -> yTensor -EE where(Tensor inputTensor, - Tensor conditionTensor, - Tensor yTensor, - Tensor outputTensor, - ArchInfo_t archInfo) +EE where( + Tensor conditionTensor, Tensor xTensor, Tensor yTensor, Tensor outputTensor, ArchInfo_t archInfo) { auto arch = archInfo->arch; - void *input = get_ptr_from_tensor(inputTensor, arch); void *condition = get_ptr_from_tensor(conditionTensor, arch); - void *yPtr = get_ptr_from_tensor(yTensor, arch); - void *output = get_ptr_from_tensor(outputTensor, arch); - TensorDesc inputDesc = inputTensor.get_desc(); + void *x = get_ptr_from_tensor(xTensor, arch); + void *y = get_ptr_from_tensor(yTensor, arch); + void *out = get_ptr_from_tensor(outputTensor, arch); TensorDesc conditionDesc = conditionTensor.get_desc(); + TensorDesc xDesc = xTensor.get_desc(); TensorDesc yDesc = yTensor.get_desc(); - - if (inputDesc.dims[1] == 1) { - memcpy(output, input, tensorNumBytes(inputDesc)); - return SUCCESS; - } + TensorDesc outDesc = outputTensor.get_desc(); EE ret = SUCCESS; - switch (inputDesc.dt) { + switch (xDesc.dt) { #ifdef _USE_FP32 case DT_F32: { - ret = diffSourceWhere(inputDesc, conditionDesc, yDesc, (F32 *)input, (U8 *)condition, - (F32 *)yPtr, (F32 *)output); + where_kernel(conditionDesc, (const U8 *)condition, xDesc, (const F32 *)x, yDesc, + (const F32 *)y, outDesc, (F32 *)out); break; } #endif #ifdef _USE_FP16 case DT_F16: { - ret = diffSourceWhere(inputDesc, conditionDesc, yDesc, (F16 *)input, (U8 *)condition, - (F16 *)yPtr, (F16 *)output); + where_kernel(conditionDesc, (const U8 *)condition, xDesc, (const F16 *)x, yDesc, + (const F16 *)y, outDesc, (F16 *)out); break; } #endif diff --git a/compute/tensor/tests/test_activation.cpp b/compute/tensor/tests/test_activation.cpp index 02b7e443..5c2bb229 100644 --- a/compute/tensor/tests/test_activation.cpp +++ b/compute/tensor/tests/test_activation.cpp @@ -23,7 +23,7 @@ int activationFunctionTest(U32 in, const char *activationType) { DataFormat df = DF_NCHWC8; - memset(activationDesc.value, 0, sizeof(activationDesc.value)); + UNI_MEMSET(activationDesc.value, 0, sizeof(activationDesc.value)); TensorDesc dataDesc = tensor4df(dt, df, in, ic, ih, iw); U32 len = tensorNumElements(dataDesc); @@ -32,8 +32,8 @@ int activationFunctionTest(U32 in, Tensor dataTensor = Tensor::alloc_sized(dataDesc); Tensor dataTensorRef = Tensor::alloc_sized(dataDesc); - memcpy(get_ptr_from_tensor(dataTensor, CPU_GENERAL), data, tensorNumBytes(dataDesc)); - memcpy(get_ptr_from_tensor(dataTensorRef, CPU_GENERAL), data, tensorNumBytes(dataDesc)); + UNI_MEMCPY(get_ptr_from_tensor(dataTensor, CPU_GENERAL), data, tensorNumBytes(dataDesc)); + 
UNI_MEMCPY(get_ptr_from_tensor(dataTensorRef, CPU_GENERAL), data, tensorNumBytes(dataDesc)); if (UT_CHECK) { //check diff --git a/compute/tensor/tests/test_argmax.cpp b/compute/tensor/tests/test_argmax.cpp index cec11ded..05f2219f 100644 --- a/compute/tensor/tests/test_argmax.cpp +++ b/compute/tensor/tests/test_argmax.cpp @@ -30,7 +30,7 @@ int argmaxTest(int argc, char **argv, DataType dt) Tensor inputTensor; inputTensor.resize(inDesc); inputTensor.alloc(); - memcpy(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inDesc)); + UNI_MEMCPY(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inDesc)); Tensor outputTensor; Tensor outputTensorRef; diff --git a/compute/tensor/tests/test_attention.cpp b/compute/tensor/tests/test_attention.cpp index 0e3fe0e1..c1b0d0cb 100644 --- a/compute/tensor/tests/test_attention.cpp +++ b/compute/tensor/tests/test_attention.cpp @@ -56,7 +56,7 @@ int attentionTest(int argc, char **argv, DataType dt) } } - memcpy(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inDesc)); + UNI_MEMCPY(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inDesc)); if (UT_CHECK) { CHECK_STATUS(attention(inputTensor, outputTensor, &UT_CPU_ARCHINFO)); diff --git a/compute/tensor/tests/test_axpby.cpp b/compute/tensor/tests/test_axpby.cpp index 02f9cfa9..23188c45 100644 --- a/compute/tensor/tests/test_axpby.cpp +++ b/compute/tensor/tests/test_axpby.cpp @@ -28,7 +28,7 @@ int axpbyTest(int argc, char **argv, DataType dt) U8 *y = ut_input_v(len, dt, UT_INIT_RANDOM); U8 *y_ref = ut_input_v(len, dt, UT_INIT_ZERO); - memcpy(y_ref, y, tensorNumBytes(yDesc)); + UNI_MEMCPY(y_ref, y, tensorNumBytes(yDesc)); // check if (UT_CHECK) { CHECK_STATUS(vector_vector_axpby(a, xDesc, x, b, yDesc, y, UT_CPU_ARCH)); diff --git a/compute/tensor/tests/test_check.cpp b/compute/tensor/tests/test_check.cpp index 2acb8965..e6c664ea 100644 --- a/compute/tensor/tests/test_check.cpp +++ b/compute/tensor/tests/test_check.cpp @@ -24,28 +24,23 @@ int checkTest(int argc, char **argv, DataType dt) DataFormat df = DF_NCHW; CheckParamSpec p; - p.check_mode = CHECK_EQUAL; + p.mode = CHECK_EQUAL; TensorDesc inDesc = tensor4df(dt, df, in, ic, ih, iw); U8 *inputA = ut_input_v(tensorNumElements(inDesc), dt, UT_INIT_RANDOM); U8 *inputB = ut_input_v(tensorNumElements(inDesc), dt, UT_INIT_RANDOM); - Tensor inputTensorA; - Tensor inputTensorB; - inputTensorA.resize(inDesc); - inputTensorB.resize(inDesc); - inputTensorA.alloc(); - inputTensorB.alloc(); - memcpy(get_ptr_from_tensor(inputTensorA, CPU_GENERAL), inputA, tensorNumBytes(inDesc)); - memcpy(get_ptr_from_tensor(inputTensorB, CPU_GENERAL), inputB, tensorNumBytes(inDesc)); + Tensor inputTensorA = Tensor::alloc_sized(inDesc); + Tensor inputTensorB = Tensor::alloc_sized(inDesc); + UNI_MEMCPY(get_ptr_from_tensor(inputTensorA, CPU_GENERAL), inputA, tensorNumBytes(inDesc)); + UNI_MEMCPY(get_ptr_from_tensor(inputTensorB, CPU_GENERAL), inputB, tensorNumBytes(inDesc)); Tensor outputTensor; - Tensor outputTensorRef; CHECK_STATUS( check_infer_output_size({&inputTensorA, &inputTensorB}, &outputTensor, &UT_CPU_ARCHINFO)); outputTensor.alloc(); - outputTensorRef.resize(outputTensor.get_desc()); - outputTensorRef.alloc(); + TensorDesc outDesc = outputTensor.get_desc(); + Tensor outputTensorRef = Tensor::alloc_sized(outDesc); if (UT_CHECK) { CHECK_STATUS(check(inputTensorA, inputTensorB, p, outputTensor, &UT_CPU_ARCHINFO)); @@ -55,7 +50,7 @@ int checkTest(int argc, char **argv, DataType dt) // check 
ut_check_v(get_ptr_from_tensor(outputTensor, CPU_GENERAL), - get_ptr_from_tensor(outputTensorRef, CPU_GENERAL), outputTensor.length(), DT_I32, 0, + get_ptr_from_tensor(outputTensorRef, CPU_GENERAL), outputTensor.length(), outDesc.dt, 0, __FILE__, __LINE__); } diff --git a/compute/tensor/tests/test_clip.cpp b/compute/tensor/tests/test_clip.cpp index ea447ea9..2baca31c 100644 --- a/compute/tensor/tests/test_clip.cpp +++ b/compute/tensor/tests/test_clip.cpp @@ -27,7 +27,7 @@ int clipTest(int argc, char **argv, DataType dt) Tensor inputTensor; inputTensor.resize(inDesc); inputTensor.alloc(); - memcpy(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inDesc)); + UNI_MEMCPY(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inDesc)); Tensor outputTensor; Tensor outputTensorRef; diff --git a/compute/tensor/tests/test_concat.cpp b/compute/tensor/tests/test_concat.cpp index 750308ec..433dbd59 100644 --- a/compute/tensor/tests/test_concat.cpp +++ b/compute/tensor/tests/test_concat.cpp @@ -60,7 +60,7 @@ int concatTest(int argc, char **argv, DataType dt) // setup tmp U32 tmpBytes; - CHECK_STATUS(concat_infer_forward_tmp_bytes(inTensors, &tmpBytes, &UT_CPU_ARCHINFO)); + CHECK_STATUS(concat_infer_forward_tmp_bytes(inTensors, outTensor, &tmpBytes, &UT_CPU_ARCHINFO)); Tensor tmpTensor; tmpTensor.resize(tensor1d(DT_U8, tmpBytes)); tmpTensor.alloc(); @@ -85,7 +85,7 @@ int concatTest(int argc, char **argv, DataType dt) transformToNCHW(inputDesc, srcPtr, tmpDesc, tmpPtr); srcPtr = tmpPtr; } - memcpy(outputRef + count, srcPtr, bytes); + UNI_MEMCPY(outputRef + count, srcPtr, bytes); count += bytes; tmpPtr += bytes; } diff --git a/compute/tensor/tests/test_concat_int8.cpp b/compute/tensor/tests/test_concat_int8.cpp index 16f8088f..e9f11c13 100644 --- a/compute/tensor/tests/test_concat_int8.cpp +++ b/compute/tensor/tests/test_concat_int8.cpp @@ -75,7 +75,7 @@ int int8ConcatTest(int argc, char **argv, DataType dt) U8 *tmp = (U8 *)ut_input_v(in_len, dt, UT_INIT_ZERO); U8 *out_d = (U8 *)ut_input_v(in_len, dt, UT_INIT_ZERO); for (int i = 0, index = 0; i < num; i++) { - memcpy(tmp + index, get_ptr_from_tensor(inTensorsRef[i], CPU_GENERAL), + UNI_MEMCPY(tmp + index, get_ptr_from_tensor(inTensorsRef[i], CPU_GENERAL), inTensorsRef[i].bytes()); index += inTensorsRef[i].bytes(); } diff --git a/compute/tensor/tests/test_concat_ocl.cpp b/compute/tensor/tests/test_concat_ocl.cpp index 6ab06559..055957ce 100644 --- a/compute/tensor/tests/test_concat_ocl.cpp +++ b/compute/tensor/tests/test_concat_ocl.cpp @@ -85,7 +85,7 @@ int concatTest(int argc, char **argv, DataType dt) U32 maxBytes = 0; U32 tmpBytes = 0; - CHECK_STATUS(concat_infer_forward_tmp_bytes(inputTensor, &tmpBytes, &archInfo)); + CHECK_STATUS(concat_infer_forward_tmp_bytes(inputTensor, outputTensor, &tmpBytes, &archInfo)); maxBytes = (tmpBytes > maxBytes) ? 
tmpBytes : maxBytes; GCLMem_t output = alloc(outputTensor); @@ -129,7 +129,7 @@ int concatTest(int argc, char **argv, DataType dt) for (int i = 0; i < num; i++) { inputTensorCpu[i].alloc(); inputDesc[i].df = DF_NCHW; - memcpy(get_ptr_from_tensor(inputTensorCpu[i], CPU_GENERAL), input_cpu[i], + UNI_MEMCPY(get_ptr_from_tensor(inputTensorCpu[i], CPU_GENERAL), input_cpu[i], tensorNumBytes(inputDesc[i])); } diff --git a/compute/tensor/tests/test_convolution.cpp b/compute/tensor/tests/test_convolution.cpp index c8395c66..5877bdec 100644 --- a/compute/tensor/tests/test_convolution.cpp +++ b/compute/tensor/tests/test_convolution.cpp @@ -51,7 +51,7 @@ int convolutionTest(int argc, char *argv[], DataType dt) TensorDesc filterDesc = tensor4df(dt, DF_NCHW, fn, fc, fh, fw); TensorDesc biasDesc = tensor1d(dt, oc); ConvolutionParamSpec p = createConvolutionParamSpec(group, 1, fh, fw, 1, stride, stride, 0, 0, - padding, padding, padding, padding, 1, 1, 1, fn, Convolution_Depthwise_Pointwise); + padding, padding, padding, padding, 1, 1, 1, fn, CONVOLUTION_DEPTHWISE_POINTWISE); // setup input, filter, bias U8 *input = ut_input_v(in * ic * ih * iw, dt, UT_INIT_RANDOM); @@ -76,12 +76,15 @@ int convolutionTest(int argc, char *argv[], DataType dt) filterTensor.alloc(); filterTensorRef.alloc(); biasTensor.alloc(); - memcpy(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, bytesOf(dt) * in * ic * ih * iw); - memcpy(get_ptr_from_tensor(inputTensorRef, CPU_GENERAL), input, bytesOf(dt) * in * ic * ih * iw); - memcpy(get_ptr_from_tensor(filterTensor, CPU_GENERAL), filter, bytesOf(dt) * fn * fc * fh * fw); - memcpy( + UNI_MEMCPY( + get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, bytesOf(dt) * in * ic * ih * iw); + UNI_MEMCPY( + get_ptr_from_tensor(inputTensorRef, CPU_GENERAL), input, bytesOf(dt) * in * ic * ih * iw); + UNI_MEMCPY( + get_ptr_from_tensor(filterTensor, CPU_GENERAL), filter, bytesOf(dt) * fn * fc * fh * fw); + UNI_MEMCPY( get_ptr_from_tensor(filterTensorRef, CPU_GENERAL), filter, bytesOf(dt) * fn * fc * fh * fw); - memcpy(get_ptr_from_tensor(biasTensor, CPU_GENERAL), bias, bytesOf(dt) * oc); + UNI_MEMCPY(get_ptr_from_tensor(biasTensor, CPU_GENERAL), bias, bytesOf(dt) * oc); // setup output, bias CHECK_STATUS(convolution_infer_output_size( diff --git a/compute/tensor/tests/test_convolution_bnn.cpp b/compute/tensor/tests/test_convolution_bnn.cpp index 765ad294..0a44058c 100644 --- a/compute/tensor/tests/test_convolution_bnn.cpp +++ b/compute/tensor/tests/test_convolution_bnn.cpp @@ -14,7 +14,7 @@ #include "tensor_computing.h" #include "ut_util.h" -int bnnConvolutionTest(int argc, char *argv[], DataType dt) +int bnnConvolutionTest(int argc, char *argv[], DataType idt, DataType fdt) { CHECK_REQUIREMENT(argc == 16); // in data @@ -39,21 +39,20 @@ int bnnConvolutionTest(int argc, char *argv[], DataType dt) CHECK_REQUIREMENT(in == 1 && on == 1); - DataType fdt = DT_BIN11; // Use dt to distinguish DoReFa and XNOR ActivationParamSpec activationDesc; activationDesc.mode = ACTIVATION_NULL; - TensorDesc inputDesc = tensor4df(dt, DF_NCHWC8, in, ic, ih, iw); + TensorDesc inputDesc = tensor4df(idt, DF_NCHWC8, in, ic, ih, iw); TensorDesc filterDesc = tensor4df(fdt, DF_NCHW, oc, ic, fh, fw); - TensorDesc biasDesc = tensor1d(dt, oc * 2); // including scale and bias + TensorDesc biasDesc = tensor1d(idt, oc * 2); // including scale and bias ConvolutionParamSpec p = createConvolutionParamSpec(group, 1, fh, fw, 1, stride, stride, 0, 0, - padding, padding, padding, padding, 1, 1, 1, oc, 
Convolution_Depthwise_Pointwise); + padding, padding, padding, padding, 1, 1, 1, oc, CONVOLUTION_DEPTHWISE_POINTWISE); // setup input, filter, bias - U8 *input = ut_input_v(in * ic * ih * iw, dt, UT_INIT_RANDOM); + U8 *input = ut_input_v(in * ic * ih * iw, idt, UT_INIT_RANDOM); if (fdt == DT_BIN01) { for (U32 i = 0; i < in * ic * ih * iw; i++) { - switch (dt) { + switch (idt) { #ifdef _USE_FP16 case DT_F16: ((F16 *)input)[i] += 0.5; @@ -71,7 +70,7 @@ int bnnConvolutionTest(int argc, char *argv[], DataType dt) } BIN8 *filter = (BIN8 *)ut_input_v(fn * fc * fh * fw / 8, fdt, UT_INIT_POS); - U8 *bias = ut_input_v(oc * 2, dt, UT_INIT_RANDOM); + U8 *bias = ut_input_v(oc * 2, idt, UT_INIT_RANDOM); Tensor inputTensor; Tensor inputTensorRef; Tensor filterTensor; @@ -91,15 +90,18 @@ int bnnConvolutionTest(int argc, char *argv[], DataType dt) filterTensor.alloc(); filterTensorRef.alloc(); biasTensor.alloc(); - memcpy(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, bytesOf(dt) * in * ic * ih * iw); - memcpy(get_ptr_from_tensor(inputTensorRef, CPU_GENERAL), input, bytesOf(dt) * in * ic * ih * iw); - memcpy(get_ptr_from_tensor(filterTensor, CPU_GENERAL), filter, tensorNumBytes(filterDesc)); - memcpy(get_ptr_from_tensor(filterTensorRef, CPU_GENERAL), filter, tensorNumBytes(filterDesc)); - memcpy(get_ptr_from_tensor(biasTensor, CPU_GENERAL), bias, tensorNumBytes(biasDesc)); + UNI_MEMCPY( + get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, bytesOf(idt) * in * ic * ih * iw); + UNI_MEMCPY( + get_ptr_from_tensor(inputTensorRef, CPU_GENERAL), input, bytesOf(idt) * in * ic * ih * iw); + UNI_MEMCPY(get_ptr_from_tensor(filterTensor, CPU_GENERAL), filter, tensorNumBytes(filterDesc)); + UNI_MEMCPY( + get_ptr_from_tensor(filterTensorRef, CPU_GENERAL), filter, tensorNumBytes(filterDesc)); + UNI_MEMCPY(get_ptr_from_tensor(biasTensor, CPU_GENERAL), bias, tensorNumBytes(biasDesc)); // setup output, bias CHECK_STATUS(convolution_infer_output_size( - &inputTensor, filterTensor, p, &outputTensor, dt, &UT_CPU_ARCHINFO)); + &inputTensor, filterTensor, p, &outputTensor, idt, &UT_CPU_ARCHINFO)); outputTensor.alloc(); outputTensorRef.resize(outputTensor.get_desc()); @@ -141,7 +143,7 @@ int bnnConvolutionTest(int argc, char *argv[], DataType dt) tmpTensors, outputTensorRef, activationDesc, &UT_SERIAL_ARCHINFO)); // check ut_check_v(get_ptr_from_tensor(outputTensor, CPU_GENERAL), - get_ptr_from_tensor(outputTensorRef, CPU_GENERAL), outputTensor.length(), dt, 1, + get_ptr_from_tensor(outputTensorRef, CPU_GENERAL), outputTensor.length(), idt, 1, __FILE__, __LINE__); } @@ -172,7 +174,8 @@ int bnnConvolutionTest(int argc, char *argv[], DataType dt) int main(int argc, char **argv) { #ifdef _USE_FP16 - bnnConvolutionTest(argc, argv, DT_F16); + bnnConvolutionTest(argc, argv, DT_F16, DT_BIN01); + bnnConvolutionTest(argc, argv, DT_F16, DT_BIN11); #endif return 0; } diff --git a/compute/tensor/tests/test_convolution_int8.cpp b/compute/tensor/tests/test_convolution_int8.cpp index c39f1fd7..8f569f04 100644 --- a/compute/tensor/tests/test_convolution_int8.cpp +++ b/compute/tensor/tests/test_convolution_int8.cpp @@ -45,7 +45,7 @@ int int8ConvolutionTest(int argc, char *argv[], DataType dt, DataType filterData TensorDesc inputDesc, filterDesc, outputDesc, biasDesc; ConvolutionParamSpec p = createConvolutionParamSpec(group, 1, fh, fw, 1, stride, stride, 0, 0, - padding, padding, padding, padding, 1, 1, 1, fn, Convolution_Depthwise_Pointwise); + padding, padding, padding, padding, 1, 1, 1, fn, CONVOLUTION_DEPTHWISE_POINTWISE); if (ic % 
8 != 0) { printf("[WARN] can not quantize the first layer\n"); @@ -120,10 +120,8 @@ int int8ConvolutionTest(int argc, char *argv[], DataType dt, DataType filterData CHECK_STATUS(convolution_transform_filter( filterTensor, p, alg, tmpTensor, &tFilter, &UT_CPU_ARCHINFO)); - TensorDesc ftmDesc = tFilter.get_desc(); - ftmDesc.dt = DT_I8; - ftmTensor = Tensor::alloc_sized(ftmDesc); - + U32 ftmBytes = ftBytes / bytesOf(filterDataType); + ftmTensor = Tensor::alloc_sized(tensor1d(DT_U8, ftmBytes)); scales = std::vector(38); CHECK_STATUS(quantize(tFilter, &ftmTensor, scales.data() + 2, &UT_CPU_ARCHINFO)); break; @@ -159,7 +157,7 @@ int int8ConvolutionTest(int argc, char *argv[], DataType dt, DataType filterData // TensorDesc inputC16Desc = inputDesc; // inputC16Desc.df = DF_NCHWC16; // transformToNCHWC16(inputDesc, (void *)get_ptr_from_tensor(inputTensor, CPU_GENERAL), inputC16Desc, inputC16); - // memcpy(get_ptr_from_tensor(inputTensor, CPU_GENERAL), inputC16, tensorNumBytes(inputDesc)); + // UNI_MEMCPY(get_ptr_from_tensor(inputTensor, CPU_GENERAL), inputC16, tensorNumBytes(inputDesc)); // inputTensor.resize(inputC16Desc); // free(inputC16); // #endif diff --git a/compute/tensor/tests/test_convolution_ocl.cpp b/compute/tensor/tests/test_convolution_ocl.cpp index 080049fa..e8479769 100644 --- a/compute/tensor/tests/test_convolution_ocl.cpp +++ b/compute/tensor/tests/test_convolution_ocl.cpp @@ -96,7 +96,7 @@ int convolutionTest(int argc, char *argv[], DataType dt) dilationH = atoi(argv[14]); dilationW = atoi(argv[15]); if (argc == 17) { - use_nchw = atoi(argv[6]); + use_nchw = atoi(argv[16]); } } @@ -136,7 +136,7 @@ int convolutionTest(int argc, char *argv[], DataType dt) activationDesc.mode = ACTIVATION_NULL; ConvolutionParamSpec convParamSpec = createConvolutionParamSpec(group, ft, fh, fw, strideT, strideH, strideW, paddingTF, paddingTB, paddingT, paddingB, paddingL, paddingR, 1, - dilationH, dilationW, fn, Convolution_Depthwise_Pointwise); + dilationH, dilationW, fn, CONVOLUTION_DEPTHWISE_POINTWISE); TensorDesc inputDesc, filterDesc, inputDesc_gpu; if (it > 1) { @@ -220,7 +220,7 @@ int convolutionTest(int argc, char *argv[], DataType dt) U32 ocAlign = (oc + 3) / 4 * 4; if (ocAlign != oc) { U8 *bias_cpu_align = ut_input_v(ocAlign, dt, UT_INIT_ZERO); - memcpy(bias_cpu_align, bias_cpu, oc * bytesOf(dt)); + UNI_MEMCPY(bias_cpu_align, bias_cpu, oc * bytesOf(dt)); free(bias_cpu); bias_cpu = bias_cpu_align; } @@ -245,7 +245,7 @@ int convolutionTest(int argc, char *argv[], DataType dt) tmp[0] = tmpTensorImgA; } alloc_img(tmpTensorImgB, maxBytes + 4); - Tensor filterTensorTran = filterTensor; + Tensor filterTensorTran = filterTensor; if (alg == CONVOLUTION_ALGORITHM_WINOGRAD && archInfo.arch == QUALCOMM) { tmp[0] = tmpTensor; @@ -260,8 +260,8 @@ int convolutionTest(int argc, char *argv[], DataType dt) CHECK_STATUS(ocl_set_input(handle, input, inputDesc_gpu, input_cpu, tmpbuf, true)); std::vector inputTensors(1, inputTensor); - CHECK_STATUS(convolution(inputTensors, filterTensorTran, convParamSpec, alg, nullptr, biasTensor, - tmp, outputTensor, activationDesc, &archInfo)); + CHECK_STATUS(convolution(inputTensors, filterTensorTran, convParamSpec, alg, nullptr, + biasTensor, tmp, outputTensor, activationDesc, &archInfo)); /*warp up*/ for (U32 i = 0; i < 2; i++) { @@ -314,18 +314,19 @@ int convolutionTest(int argc, char *argv[], DataType dt) Tensor inputTensorCpu; inputTensorCpu.resize(inputDesc); inputTensorCpu.alloc(); - memcpy(get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), input_cpu, 
tensorNumBytes(inputDesc)); + UNI_MEMCPY( + get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), input_cpu, tensorNumBytes(inputDesc)); Tensor filterTensorCpu; filterTensorCpu.resize(filterDesc); filterTensorCpu.alloc(); - memcpy( + UNI_MEMCPY( get_ptr_from_tensor(filterTensorCpu, CPU_GENERAL), filter_cpu, tensorNumBytes(filterDesc)); Tensor biasTensorCpu; biasTensorCpu.resize(biasDesc); biasTensorCpu.alloc(); - memcpy(get_ptr_from_tensor(biasTensorCpu, CPU_GENERAL), bias_cpu, tensorNumBytes(biasDesc)); + UNI_MEMCPY(get_ptr_from_tensor(biasTensorCpu, CPU_GENERAL), bias_cpu, tensorNumBytes(biasDesc)); Tensor outputTensorCpu; outputDesc.df = DF_NCHW; diff --git a/compute/tensor/tests/test_deconvolution.cpp b/compute/tensor/tests/test_deconvolution.cpp index adcd9104..154f40c1 100644 --- a/compute/tensor/tests/test_deconvolution.cpp +++ b/compute/tensor/tests/test_deconvolution.cpp @@ -36,13 +36,12 @@ int deconvolutionTest(int argc, char **argv, DataType dt) U32 oc = atoi(argv[13]); U32 oh = atoi(argv[14]); U32 ow = atoi(argv[15]); - CHECK_REQUIREMENT(in == 1 && on == 1); CHECK_REQUIREMENT(ic % 8 == 0 && oc % 8 == 0); ActivationParamSpec activationDesc; activationDesc.mode = ACTIVATION_NULL; ConvolutionParamSpec convParamSpec = createConvolutionParamSpec(group, 1, fh, fw, 1, stride, - stride, 0, 0, padding, padding, padding, padding, 1, 1, 1, fn, Convolution_Deconvolution); + stride, 0, 0, padding, padding, padding, padding, 1, 1, 1, fn, CONVOLUTION_DECONVOLUTION); TensorDesc outputDesc; TensorDesc inputDesc = tensor4df(dt, DF_NCHWC8, in, ic, ih, iw); @@ -73,11 +72,14 @@ int deconvolutionTest(int argc, char **argv, DataType dt) filterTensor.alloc(); filterTensorRef.alloc(); biasTensor.alloc(); - memcpy(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, bytesOf(dt) * in * ic * ih * iw); - memcpy(get_ptr_from_tensor(inputTensorRef, CPU_GENERAL), input, bytesOf(dt) * in * ic * ih * iw); - memcpy(get_ptr_from_tensor(filterTensor, CPU_GENERAL), filter, tensorNumBytes(filterDesc)); - memcpy(get_ptr_from_tensor(filterTensorRef, CPU_GENERAL), filter, tensorNumBytes(filterDesc)); - memcpy(get_ptr_from_tensor(biasTensor, CPU_GENERAL), bias, tensorNumBytes(biasDesc)); + UNI_MEMCPY( + get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, bytesOf(dt) * in * ic * ih * iw); + UNI_MEMCPY( + get_ptr_from_tensor(inputTensorRef, CPU_GENERAL), input, bytesOf(dt) * in * ic * ih * iw); + UNI_MEMCPY(get_ptr_from_tensor(filterTensor, CPU_GENERAL), filter, tensorNumBytes(filterDesc)); + UNI_MEMCPY( + get_ptr_from_tensor(filterTensorRef, CPU_GENERAL), filter, tensorNumBytes(filterDesc)); + UNI_MEMCPY(get_ptr_from_tensor(biasTensor, CPU_GENERAL), bias, tensorNumBytes(biasDesc)); // setup output, bias CHECK_STATUS(deconvolution_infer_output_size( diff --git a/compute/tensor/tests/test_deconvolution_ocl.cpp b/compute/tensor/tests/test_deconvolution_ocl.cpp index 86b81227..701664c9 100644 --- a/compute/tensor/tests/test_deconvolution_ocl.cpp +++ b/compute/tensor/tests/test_deconvolution_ocl.cpp @@ -53,7 +53,7 @@ int deconvolutionTest(int argc, char *argv[], DataType dt) ActivationParamSpec activationDesc; activationDesc.mode = ACTIVATION_NULL; ConvolutionParamSpec convParamSpec = createConvolutionParamSpec(group, 1, fh, fw, 1, stride, - stride, 0, 0, padding, padding, padding, padding, 1, 1, 1, fn, Convolution_Deconvolution); + stride, 0, 0, padding, padding, padding, padding, 1, 1, 1, fn, CONVOLUTION_DECONVOLUTION); TensorDesc inputDesc = tensor4df(dt, DF_NCHW, in, ic, ih, iw); TensorDesc filterDesc = tensor4df(dt, 
DF_NCHW, fn, fc, fh, fw); @@ -116,7 +116,7 @@ int deconvolutionTest(int argc, char *argv[], DataType dt) if ((oc & 3) != 0) { U32 ocAlign = (oc + 3) / 4 * 4; U8 *bias_cpu_align = ut_input_v(ocAlign, dt, UT_INIT_ZERO); - memcpy(bias_cpu_align, bias_cpu, oc * bytesOf(dt)); + UNI_MEMCPY(bias_cpu_align, bias_cpu, oc * bytesOf(dt)); free(bias_cpu); bias_cpu = bias_cpu_align; } @@ -161,18 +161,19 @@ int deconvolutionTest(int argc, char *argv[], DataType dt) Tensor inputTensorCpu; inputTensorCpu.resize(inputDesc); inputTensorCpu.alloc(); - memcpy(get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), input_cpu, tensorNumBytes(inputDesc)); + UNI_MEMCPY( + get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), input_cpu, tensorNumBytes(inputDesc)); Tensor filterTensorCpu; filterTensorCpu.resize(filterDesc); filterTensorCpu.alloc(); - memcpy( + UNI_MEMCPY( get_ptr_from_tensor(filterTensorCpu, CPU_GENERAL), filter_cpu, tensorNumBytes(filterDesc)); Tensor biasTensorCpu; biasTensorCpu.resize(biasDesc); biasTensorCpu.alloc(); - memcpy(get_ptr_from_tensor(biasTensorCpu, CPU_GENERAL), bias_cpu, tensorNumBytes(biasDesc)); + UNI_MEMCPY(get_ptr_from_tensor(biasTensorCpu, CPU_GENERAL), bias_cpu, tensorNumBytes(biasDesc)); Tensor outputTensorCpu; outputTensorCpu.resize(outputDesc); diff --git a/compute/tensor/tests/test_depthwise_convolution.cpp b/compute/tensor/tests/test_depthwise_convolution.cpp index 3679b03d..b8f4035a 100644 --- a/compute/tensor/tests/test_depthwise_convolution.cpp +++ b/compute/tensor/tests/test_depthwise_convolution.cpp @@ -52,7 +52,7 @@ int depthwiseConvolutionTest(int argc, char *argv[], bool isFusedWithPw, DataTyp pwBiasDesc = tensor1d(dt, oc); } ConvolutionParamSpec p = createConvolutionParamSpec(group, 1, fh, fw, 1, stride, stride, 0, 0, - padding, padding, padding, padding, 1, 1, 1, fn, Convolution_Depthwise); + padding, padding, padding, padding, 1, 1, 1, fn, CONVOLUTION_DEPTHWISE); // setup input, filter, bias U8 *dwFilter = nullptr; @@ -82,13 +82,15 @@ int depthwiseConvolutionTest(int argc, char *argv[], bool isFusedWithPw, DataTyp dwFilterTensor.alloc(); dwFilterTensorRef.alloc(); dwBiasTensor.alloc(); - memcpy(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, bytesOf(dt) * in * ic * ih * iw); - memcpy(get_ptr_from_tensor(inputTensorRef, CPU_GENERAL), input, bytesOf(dt) * in * ic * ih * iw); - memcpy( + UNI_MEMCPY( + get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, bytesOf(dt) * in * ic * ih * iw); + UNI_MEMCPY( + get_ptr_from_tensor(inputTensorRef, CPU_GENERAL), input, bytesOf(dt) * in * ic * ih * iw); + UNI_MEMCPY( get_ptr_from_tensor(dwFilterTensor, CPU_GENERAL), dwFilter, bytesOf(dt) * 1 * ic * fh * fw); - memcpy(get_ptr_from_tensor(dwFilterTensorRef, CPU_GENERAL), dwFilter, + UNI_MEMCPY(get_ptr_from_tensor(dwFilterTensorRef, CPU_GENERAL), dwFilter, bytesOf(dt) * 1 * ic * fh * fw); - memcpy(get_ptr_from_tensor(dwBiasTensor, CPU_GENERAL), dwBias, bytesOf(dt) * ic); + UNI_MEMCPY(get_ptr_from_tensor(dwBiasTensor, CPU_GENERAL), dwBias, bytesOf(dt) * ic); Tensor pwFilterTensor; Tensor pwFilterTensorRef; Tensor pwBiasTensor; @@ -101,11 +103,11 @@ int depthwiseConvolutionTest(int argc, char *argv[], bool isFusedWithPw, DataTyp pwFilterTensor.alloc(); pwFilterTensorRef.alloc(); pwBiasTensor.alloc(); - memcpy(get_ptr_from_tensor(pwFilterTensor, CPU_GENERAL), pwFilter, + UNI_MEMCPY(get_ptr_from_tensor(pwFilterTensor, CPU_GENERAL), pwFilter, bytesOf(dt) * oc * ic * 1 * 1); - memcpy(get_ptr_from_tensor(pwFilterTensorRef, CPU_GENERAL), pwFilter, + 
UNI_MEMCPY(get_ptr_from_tensor(pwFilterTensorRef, CPU_GENERAL), pwFilter, bytesOf(dt) * oc * ic * 1 * 1); - memcpy(get_ptr_from_tensor(pwBiasTensor, CPU_GENERAL), pwBias, bytesOf(dt) * oc); + UNI_MEMCPY(get_ptr_from_tensor(pwBiasTensor, CPU_GENERAL), pwBias, bytesOf(dt) * oc); } // setup output, bias @@ -183,21 +185,21 @@ int depthwiseConvolutionTest(int argc, char *argv[], bool isFusedWithPw, DataTyp if (UT_CHECK) { if (isFusedWithPw) { CHECK_STATUS(depthwise_pointwise_convolution(inputTensors, dwFtmTensor, pwFtmTensor, p, - alg, dwBiasTensor, pwBiasTensor, tmpTensors, outputTensor, dwActivationParamSpec, - pwActivationParamSpec, &UT_CPU_ARCHINFO)); + alg, nullptr, dwBiasTensor, pwBiasTensor, tmpTensors, outputTensor, + dwActivationParamSpec, pwActivationParamSpec, &UT_CPU_ARCHINFO)); // naive implement CHECK_STATUS(depthwise_pointwise_convolution(inputTensorsRef, dwFilterTensorRef, - pwFilterTensorRef, p, alg, dwBiasTensor, pwBiasTensor, tmpTensors, outputTensorRef, - dwActivationParamSpec, pwActivationParamSpec, &UT_SERIAL_ARCHINFO)); + pwFilterTensorRef, p, alg, nullptr, dwBiasTensor, pwBiasTensor, tmpTensors, + outputTensorRef, dwActivationParamSpec, pwActivationParamSpec, &UT_SERIAL_ARCHINFO)); } else { - CHECK_STATUS(depthwise_convolution(inputTensor, dwFtmTensor, p, alg, dwBiasTensor, - tmpTensor, outputTensor, dwActivationParamSpec, &UT_CPU_ARCHINFO)); + CHECK_STATUS(depthwise_convolution(inputTensor, dwFtmTensor, p, alg, nullptr, + dwBiasTensor, tmpTensor, outputTensor, dwActivationParamSpec, &UT_CPU_ARCHINFO)); // naive implement - CHECK_STATUS( - depthwise_convolution(inputTensorRef, dwFilterTensorRef, p, alg, dwBiasTensor, - tmpTensor, outputTensorRef, dwActivationParamSpec, &UT_SERIAL_ARCHINFO)); + CHECK_STATUS(depthwise_convolution(inputTensorRef, dwFilterTensorRef, p, alg, nullptr, + dwBiasTensor, tmpTensor, outputTensorRef, dwActivationParamSpec, + &UT_SERIAL_ARCHINFO)); } // check @@ -211,11 +213,11 @@ int depthwiseConvolutionTest(int argc, char *argv[], bool isFusedWithPw, DataTyp for (int iter = 0; iter < UT_LOOPS; iter++) { if (isFusedWithPw) { CHECK_STATUS(depthwise_pointwise_convolution(inputTensors, dwFtmTensor, pwFtmTensor, p, - alg, dwBiasTensor, pwBiasTensor, tmpTensors, outputTensor, dwActivationParamSpec, - pwActivationParamSpec, &UT_CPU_ARCHINFO)); + alg, nullptr, dwBiasTensor, pwBiasTensor, tmpTensors, outputTensor, + dwActivationParamSpec, pwActivationParamSpec, &UT_CPU_ARCHINFO)); } else { - CHECK_STATUS(depthwise_convolution(inputTensor, dwFtmTensor, p, alg, dwBiasTensor, - tmpTensor, outputTensor, dwActivationParamSpec, &UT_CPU_ARCHINFO)); + CHECK_STATUS(depthwise_convolution(inputTensor, dwFtmTensor, p, alg, nullptr, + dwBiasTensor, tmpTensor, outputTensor, dwActivationParamSpec, &UT_CPU_ARCHINFO)); } } double time_end = ut_time_ms(); diff --git a/compute/tensor/tests/test_depthwise_convolution_int8.cpp b/compute/tensor/tests/test_depthwise_convolution_int8.cpp index df11b4cf..52f5d09f 100644 --- a/compute/tensor/tests/test_depthwise_convolution_int8.cpp +++ b/compute/tensor/tests/test_depthwise_convolution_int8.cpp @@ -53,7 +53,7 @@ int main(int argc, char *argv[]) dwBiasDesc = tensor1d(odt, ic); pwBiasDesc = tensor1d(odt, oc); ConvolutionParamSpec convParamSpec = createConvolutionParamSpec(group, 1, fh, fw, 1, stride, - stride, 0, 0, padding, padding, padding, padding, 1, 1, 1, fn, Convolution_Depthwise); + stride, 0, 0, padding, padding, padding, padding, 1, 1, 1, fn, CONVOLUTION_DEPTHWISE); // setup input, filter, bias INT8 *input = (INT8 
*)ut_input_v(in * ic * ih * iw, DT_I8, UT_INIT_RANDOM); @@ -81,13 +81,15 @@ int main(int argc, char *argv[]) dwFilterTensor.alloc(); dwFilterTensorRef.alloc(); dwBiasTensor.alloc(); - memcpy(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, bytesOf(dt) * in * ic * ih * iw); - memcpy(get_ptr_from_tensor(inputTensorRef, CPU_GENERAL), input, bytesOf(dt) * in * ic * ih * iw); - memcpy( + UNI_MEMCPY( + get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, bytesOf(dt) * in * ic * ih * iw); + UNI_MEMCPY( + get_ptr_from_tensor(inputTensorRef, CPU_GENERAL), input, bytesOf(dt) * in * ic * ih * iw); + UNI_MEMCPY( get_ptr_from_tensor(dwFilterTensor, CPU_GENERAL), dwFilter, bytesOf(dt) * 1 * ic * fh * fw); - memcpy(get_ptr_from_tensor(dwFilterTensorRef, CPU_GENERAL), dwFilter, + UNI_MEMCPY(get_ptr_from_tensor(dwFilterTensorRef, CPU_GENERAL), dwFilter, bytesOf(dt) * 1 * ic * fh * fw); - memcpy(get_ptr_from_tensor(dwBiasTensor, CPU_GENERAL), dwBias, bytesOf(dt) * ic); + UNI_MEMCPY(get_ptr_from_tensor(dwBiasTensor, CPU_GENERAL), dwBias, bytesOf(dt) * ic); Tensor pwFilterTensor; Tensor pwFilterTensorRef; @@ -98,11 +100,11 @@ int main(int argc, char *argv[]) pwFilterTensor.alloc(); pwFilterTensorRef.alloc(); pwBiasTensor.alloc(); - memcpy( + UNI_MEMCPY( get_ptr_from_tensor(pwFilterTensor, CPU_GENERAL), pwFilter, bytesOf(dt) * oc * ic * 1 * 1); - memcpy(get_ptr_from_tensor(pwFilterTensorRef, CPU_GENERAL), pwFilter, + UNI_MEMCPY(get_ptr_from_tensor(pwFilterTensorRef, CPU_GENERAL), pwFilter, bytesOf(dt) * oc * ic * 1 * 1); - memcpy(get_ptr_from_tensor(pwBiasTensor, CPU_GENERAL), pwBias, bytesOf(dt) * oc); + UNI_MEMCPY(get_ptr_from_tensor(pwBiasTensor, CPU_GENERAL), pwBias, bytesOf(dt) * oc); // setup output, bias CHECK_STATUS(depthwise_pointwise_convolution_infer_output_size(&inputTensor, dwFilterTensor, @@ -143,15 +145,16 @@ int main(int argc, char *argv[]) std::vector<Tensor> inputTensors(1, inputTensor); std::vector<Tensor> inputTensorsRef(1, inputTensorRef); std::vector<Tensor> tmpTensors(1, tmpTensor); + F32 scales[3] = {1, 1, 1}; if (UT_CHECK) { CHECK_STATUS(depthwise_pointwise_convolution(inputTensors, dwFtmTensor, pwFtmTensor, - convParamSpec, alg, dwBiasTensor, pwBiasTensor, tmpTensors, outputTensor, + convParamSpec, alg, scales, dwBiasTensor, pwBiasTensor, tmpTensors, outputTensor, dwActivationParamSpec, pwActivationParamSpec, &UT_CPU_ARCHINFO)); // naive implement CHECK_STATUS(depthwise_pointwise_convolution(inputTensorsRef, dwFilterTensorRef, - pwFilterTensorRef, convParamSpec, alg, dwBiasTensor, pwBiasTensor, tmpTensors, + pwFilterTensorRef, convParamSpec, alg, scales, dwBiasTensor, pwBiasTensor, tmpTensors, outputTensorRef, dwActivationParamSpec, pwActivationParamSpec, &UT_SERIAL_ARCHINFO)); // check @@ -164,7 +167,7 @@ int main(int argc, char *argv[]) double time_start = ut_time_ms(); for (int iter = 0; iter < UT_LOOPS; iter++) { CHECK_STATUS(depthwise_pointwise_convolution(inputTensors, dwFtmTensor, pwFtmTensor, - convParamSpec, alg, dwBiasTensor, pwBiasTensor, tmpTensors, outputTensor, + convParamSpec, alg, scales, dwBiasTensor, pwBiasTensor, tmpTensors, outputTensor, dwActivationParamSpec, pwActivationParamSpec, &UT_CPU_ARCHINFO)); } double time_end = ut_time_ms(); diff --git a/compute/tensor/tests/test_depthwise_convolution_ocl.cpp b/compute/tensor/tests/test_depthwise_convolution_ocl.cpp index bc977906..6cc96821 100644 --- a/compute/tensor/tests/test_depthwise_convolution_ocl.cpp +++ b/compute/tensor/tests/test_depthwise_convolution_ocl.cpp @@ -82,7 +82,7 @@ int depthwiseConvolutionTest(int argc, char *argv[],
DataFormat filterDataFormat dwActivationParamSpec.mode = ACTIVATION_NULL; ConvolutionParamSpec convParamSpec = createConvolutionParamSpec(group, 1, fh, fw, 1, stride, stride, 0, 0, padding, padding, padding, padding, dila, dila, dila, fn, - Convolution_Depthwise); + CONVOLUTION_DEPTHWISE); U32 filterLen = fn * fc * fh * fw; U32 biasLen = oc; @@ -149,7 +149,7 @@ int depthwiseConvolutionTest(int argc, char *argv[], DataFormat filterDataFormat if ((oc & 3) != 0) { U32 ocAlign = (oc + 3) / 4 * 4; U8 *bias_cpu_align = ut_input_v(ocAlign, dt, UT_INIT_ZERO); - memcpy(bias_cpu_align, bias_cpu, oc * bytesOf(dt)); + UNI_MEMCPY(bias_cpu_align, bias_cpu, oc * bytesOf(dt)); free(bias_cpu); bias_cpu = bias_cpu_align; } @@ -172,8 +172,8 @@ int depthwiseConvolutionTest(int argc, char *argv[], DataFormat filterDataFormat CHECK_STATUS(ocl_set_input(handle, input, inputDesc, input_cpu, tmpbuf, true)); - CHECK_STATUS(depthwise_convolution(inputTensor, filterTensor, convParamSpec, alg, biasTensor, - tmp, outputTensor, dwActivationParamSpec, &archInfo)); + CHECK_STATUS(depthwise_convolution(inputTensor, filterTensor, convParamSpec, alg, nullptr, + biasTensor, tmp, outputTensor, dwActivationParamSpec, &archInfo)); /*warp up*/ for (U32 i = 0; i < 2; i++) { @@ -201,18 +201,19 @@ int depthwiseConvolutionTest(int argc, char *argv[], DataFormat filterDataFormat outputDesc.df = DF_NCHW; inputTensorCpu.resize(inputDesc); inputTensorCpu.alloc(); - memcpy(get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), input_cpu, tensorNumBytes(inputDesc)); + UNI_MEMCPY( + get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), input_cpu, tensorNumBytes(inputDesc)); Tensor filterTensorCpu; filterTensorCpu.resize(filterDesc); filterTensorCpu.alloc(); - memcpy( + UNI_MEMCPY( get_ptr_from_tensor(filterTensorCpu, CPU_GENERAL), filter_cpu, tensorNumBytes(filterDesc)); Tensor biasTensorCpu; biasTensorCpu.resize(biasDesc); biasTensorCpu.alloc(); - memcpy(get_ptr_from_tensor(biasTensorCpu, CPU_GENERAL), bias_cpu, tensorNumBytes(biasDesc)); + UNI_MEMCPY(get_ptr_from_tensor(biasTensorCpu, CPU_GENERAL), bias_cpu, tensorNumBytes(biasDesc)); Tensor outputTensorCpu; outputTensorCpu.resize(outputDesc); @@ -226,8 +227,8 @@ int depthwiseConvolutionTest(int argc, char *argv[], DataFormat filterDataFormat tmpTensorCpu.alloc(); CHECK_STATUS(depthwise_convolution(inputTensorCpu, filterTensorCpu, convParamSpec, - DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT, biasTensorCpu, tmpTensorCpu, outputTensorCpu, - dwActivationParamSpec, &UT_SERIAL_ARCHINFO)); + DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT, nullptr, biasTensorCpu, tmpTensorCpu, + outputTensorCpu, dwActivationParamSpec, &UT_SERIAL_ARCHINFO)); ut_check_a(output_gpu, get_ptr_from_tensor(outputTensorCpu, CPU_GENERAL), on * oc * ow * oh, dt); CHECK_STATUS(gcl_finish(handle)); diff --git a/compute/tensor/tests/test_depthwise_pointwise_convolution_ocl.cpp b/compute/tensor/tests/test_depthwise_pointwise_convolution_ocl.cpp index 9b48a4fd..a5ba7d98 100644 --- a/compute/tensor/tests/test_depthwise_pointwise_convolution_ocl.cpp +++ b/compute/tensor/tests/test_depthwise_pointwise_convolution_ocl.cpp @@ -88,7 +88,7 @@ int depthwisePointwiseConvolutionTest( pwActivationParamSpec.mode = ACTIVATION_NULL; ConvolutionParamSpec convParamSpec = createConvolutionParamSpec(group, 1, fh, fw, 1, stride, stride, 0, 0, pt, pb, pl, pr, dilation, dilation, dilation, fn, - Convolution_Depthwise_Pointwise); + CONVOLUTION_DEPTHWISE_POINTWISE); U32 dwFilterLen = 1 * fc * fh * fw; U32 pwFilterLen = fn * fc * 1 * 1; @@ -181,14 +181,14 @@ int 
depthwisePointwiseConvolutionTest( if ((ic & 3) != 0) { U32 icAlign = (ic + 3) / 4 * 4; U8 *tmp = ut_input_v(icAlign, dt, UT_INIT_ZERO); - memcpy(tmp, dw_bias_cpu, ic * bytesOf(dt)); + UNI_MEMCPY(tmp, dw_bias_cpu, ic * bytesOf(dt)); free(dw_bias_cpu); dw_bias_cpu = tmp; } alloc_host_ptr(dwBiasTensor, dw_bias_cpu); U8 *pw_bias_val = ut_input_v(oc + 8, dt, UT_INIT_ZERO); - memcpy(pw_bias_val, pw_bias_cpu, oc * bytesOf(dt)); + UNI_MEMCPY(pw_bias_val, pw_bias_cpu, oc * bytesOf(dt)); free(pw_bias_cpu); pw_bias_cpu = pw_bias_val; alloc_host_ptr(pwBiasTensorImg, pw_bias_cpu); @@ -216,7 +216,7 @@ int depthwisePointwiseConvolutionTest( std::vector<Tensor> inputTensors(1, inputTensor); CHECK_STATUS(depthwise_pointwise_convolution(inputTensors, dwFilterTensor, pwFilterTensor, - convParamSpec, alg, dwBiasTensor, pwBiasTensor, tmpTensors, outputTensor, + convParamSpec, alg, nullptr, dwBiasTensor, pwBiasTensor, tmpTensors, outputTensor, dwActivationParamSpec, pwActivationParamSpec, &archInfo)); /*warp up*/ UNI_INFO_LOG("warm up gpu:\n") @@ -246,30 +246,31 @@ int depthwisePointwiseConvolutionTest( Tensor inputTensorCpu; inputTensorCpu.resize(inputDesc); inputTensorCpu.alloc(); - memcpy(get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), input_cpu, tensorNumBytes(inputDesc)); + UNI_MEMCPY( + get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), input_cpu, tensorNumBytes(inputDesc)); Tensor dwFilterTensorCpu; dwFilterTensorCpu.resize(dwFilterDesc); dwFilterTensorCpu.alloc(); - memcpy(get_ptr_from_tensor(dwFilterTensorCpu, CPU_GENERAL), dw_filter_cpu, + UNI_MEMCPY(get_ptr_from_tensor(dwFilterTensorCpu, CPU_GENERAL), dw_filter_cpu, tensorNumBytes(dwFilterDesc)); Tensor pwFilterTensorCpu; pwFilterTensorCpu.resize(pwFilterDesc); pwFilterTensorCpu.alloc(); - memcpy(get_ptr_from_tensor(pwFilterTensorCpu, CPU_GENERAL), pw_filter_cpu, + UNI_MEMCPY(get_ptr_from_tensor(pwFilterTensorCpu, CPU_GENERAL), pw_filter_cpu, tensorNumBytes(pwFilterDesc)); Tensor dwBiasTensorCpu; dwBiasTensorCpu.resize(dwBiasDesc); dwBiasTensorCpu.alloc(); - memcpy( + UNI_MEMCPY( get_ptr_from_tensor(dwBiasTensorCpu, CPU_GENERAL), dw_bias_cpu, tensorNumBytes(dwBiasDesc)); Tensor pwBiasTensorCpu; pwBiasTensorCpu.resize(pwBiasDesc); pwBiasTensorCpu.alloc(); - memcpy( + UNI_MEMCPY( get_ptr_from_tensor(pwBiasTensorCpu, CPU_GENERAL), pw_bias_cpu, tensorNumBytes(pwBiasDesc)); Tensor outputTensorCpu; @@ -287,8 +288,8 @@ int depthwisePointwiseConvolutionTest( std::vector<Tensor> inputTensorsCpu(1, inputTensorCpu); std::vector<Tensor> tmpTensorsCpu(1, tmpTensorCpu); CHECK_STATUS(depthwise_pointwise_convolution(inputTensorsCpu, dwFilterTensorCpu, - pwFilterTensorCpu, convParamSpec, DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT, dwBiasTensorCpu, - pwBiasTensorCpu, tmpTensorsCpu, outputTensorCpu, dwActivationParamSpec, + pwFilterTensorCpu, convParamSpec, DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT, nullptr, + dwBiasTensorCpu, pwBiasTensorCpu, tmpTensorsCpu, outputTensorCpu, dwActivationParamSpec, pwActivationParamSpec, &UT_SERIAL_ARCHINFO)); ut_check_a(output_gpu, get_ptr_from_tensor(outputTensorCpu, CPU_GENERAL), on * oc * ow * oh, dt); diff --git a/compute/tensor/tests/test_detectionoutput.cpp b/compute/tensor/tests/test_detectionoutput.cpp index 3f0e1894..af4ed5c6 100644 --- a/compute/tensor/tests/test_detectionoutput.cpp +++ b/compute/tensor/tests/test_detectionoutput.cpp @@ -57,11 +57,11 @@ int detectionoutputTest(int argc, char **argv, DataType dt) U8 *input_loc = ut_input_v(input_len_loc, dt, UT_INIT_RANDOM); U8 *input_conf = ut_input_v(input_len_conf, dt, UT_INIT_RANDOM); U8
*input_priorbox = ut_input_v(input_len_priorbox, dt, UT_INIT_RANDOM); - memcpy(get_ptr_from_tensor(inputTensor_loc, CPU_GENERAL), input_loc, + UNI_MEMCPY(get_ptr_from_tensor(inputTensor_loc, CPU_GENERAL), input_loc, tensorNumBytes(inputDesc_loc)); - memcpy(get_ptr_from_tensor(inputTensor_conf, CPU_GENERAL), input_conf, + UNI_MEMCPY(get_ptr_from_tensor(inputTensor_conf, CPU_GENERAL), input_conf, tensorNumBytes(inputDesc_conf)); - memcpy(get_ptr_from_tensor(inputTensor_priorbox, CPU_GENERAL), input_priorbox, + UNI_MEMCPY(get_ptr_from_tensor(inputTensor_priorbox, CPU_GENERAL), input_priorbox, tensorNumBytes(inputDesc_priorbox)); inputTensors[0] = inputTensor_loc; inputTensors[1] = inputTensor_conf; diff --git a/compute/tensor/tests/test_dilated_convolution.cpp b/compute/tensor/tests/test_dilated_convolution.cpp index 1dc29edb..a9b76ef9 100644 --- a/compute/tensor/tests/test_dilated_convolution.cpp +++ b/compute/tensor/tests/test_dilated_convolution.cpp @@ -50,7 +50,7 @@ int dilatedConvolutionTest(int argc, char **argv, DataType dt) TensorDesc filterDesc = tensor4df(dt, DF_NCHW, oc, ic, fh, fw); TensorDesc biasDesc = tensor1d(dt, oc); ConvolutionParamSpec convParamSpec = createConvolutionParamSpec(group, 1, fh, fw, 1, stride, - stride, 0, 0, padding, padding, padding, padding, 1, rate, rate, fn, Convolution_Dilation); + stride, 0, 0, padding, padding, padding, padding, 1, rate, rate, fn, CONVOLUTION_DILATION); // setup input, filter, bias U8 *input = ut_input_v(in * ic * ih * iw, dt, UT_INIT_RANDOM); @@ -76,11 +76,14 @@ int dilatedConvolutionTest(int argc, char **argv, DataType dt) filterTensor.alloc(); filterTensorRef.alloc(); biasTensor.alloc(); - memcpy(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, bytesOf(dt) * in * ic * ih * iw); - memcpy(get_ptr_from_tensor(inputTensorRef, CPU_GENERAL), input, bytesOf(dt) * in * ic * ih * iw); - memcpy(get_ptr_from_tensor(filterTensor, CPU_GENERAL), filter, tensorNumBytes(filterDesc)); - memcpy(get_ptr_from_tensor(filterTensorRef, CPU_GENERAL), filter, tensorNumBytes(filterDesc)); - memcpy(get_ptr_from_tensor(biasTensor, CPU_GENERAL), bias, tensorNumBytes(biasDesc)); + UNI_MEMCPY( + get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, bytesOf(dt) * in * ic * ih * iw); + UNI_MEMCPY( + get_ptr_from_tensor(inputTensorRef, CPU_GENERAL), input, bytesOf(dt) * in * ic * ih * iw); + UNI_MEMCPY(get_ptr_from_tensor(filterTensor, CPU_GENERAL), filter, tensorNumBytes(filterDesc)); + UNI_MEMCPY( + get_ptr_from_tensor(filterTensorRef, CPU_GENERAL), filter, tensorNumBytes(filterDesc)); + UNI_MEMCPY(get_ptr_from_tensor(biasTensor, CPU_GENERAL), bias, tensorNumBytes(biasDesc)); // setup output, bias CHECK_STATUS(convolution_infer_output_size( diff --git a/compute/tensor/tests/test_eltwise.cpp b/compute/tensor/tests/test_eltwise.cpp index a44af17f..b697aff9 100644 --- a/compute/tensor/tests/test_eltwise.cpp +++ b/compute/tensor/tests/test_eltwise.cpp @@ -28,7 +28,7 @@ int eltwiseTest(int argc, char **argv, DataType dt) U32 len = in * ic * ih * iw; EltwiseMode eltwiseMode = ELTWISE_MAX; EltwiseParamSpec eltwiseDesc; - eltwiseDesc.elt_mode = eltwiseMode; + eltwiseDesc.mode = eltwiseMode; eltwiseDesc.activation_type = ACTIVATION_NULL; std::vector input(num); @@ -40,7 +40,7 @@ int eltwiseTest(int argc, char **argv, DataType dt) input[i] = (void *)ut_input_v(len, dt, UT_INIT_RANDOM); inTensors[i].resize(inDesc); inTensors[i].alloc(); - memcpy(get_ptr_from_tensor(inTensors[i], CPU_GENERAL), input[i], tensorNumBytes(inDesc)); + 
UNI_MEMCPY(get_ptr_from_tensor(inTensors[i], CPU_GENERAL), input[i], tensorNumBytes(inDesc)); inTensorPtr[i] = &inTensors[i]; } diff --git a/compute/tensor/tests/test_eltwise_ocl.cpp b/compute/tensor/tests/test_eltwise_ocl.cpp index 75a709e9..033a56ce 100644 --- a/compute/tensor/tests/test_eltwise_ocl.cpp +++ b/compute/tensor/tests/test_eltwise_ocl.cpp @@ -53,7 +53,7 @@ int eltwiseTest(int argc, char *argv[], DataType dt) EltwiseMode eltwiseMode = ELTWISE_SUM; EltwiseParamSpec eltwiseDesc; - eltwiseDesc.elt_mode = eltwiseMode; + eltwiseDesc.mode = eltwiseMode; eltwiseDesc.activation_type = ACTIVATION_NULL; std::vector inputCpu(num); @@ -71,7 +71,7 @@ int eltwiseTest(int argc, char *argv[], DataType dt) inTensorsCpu[i].resize(inDesc); } inTensorsCpu[i].alloc(); - memcpy(get_ptr_from_tensor(inTensorsCpu[i], CPU_GENERAL), inputCpu[i], + UNI_MEMCPY(get_ptr_from_tensor(inTensorsCpu[i], CPU_GENERAL), inputCpu[i], tensorNumBytes(inTensorsCpu[i].get_desc())); inTensorPtrCpu[i] = &inTensorsCpu[i]; } diff --git a/compute/tensor/tests/test_expand.cpp b/compute/tensor/tests/test_expand.cpp index 2aedf53e..b8402af1 100644 --- a/compute/tensor/tests/test_expand.cpp +++ b/compute/tensor/tests/test_expand.cpp @@ -28,11 +28,11 @@ int expandTest(int argc, char **argv, DataType dt) U32 oh = atoi(argv[7]); U32 ow = atoi(argv[8]); ExpandParamSpec p; - p.shape_size = 4; - p.shape_dims[0] = on; - p.shape_dims[1] = oc; - p.shape_dims[2] = oh; - p.shape_dims[3] = ow; + p.num_shape = 4; + p.shape[0] = on; + p.shape[1] = oc; + p.shape[2] = oh; + p.shape[3] = ow; DataFormat df = DF_NCHW; TensorDesc inDesc = tensor4df(dt, df, in, ic, ih, iw); @@ -41,7 +41,7 @@ int expandTest(int argc, char **argv, DataType dt) Tensor inputTensor; inputTensor.resize(inDesc); inputTensor.alloc(); - memcpy(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inDesc)); + UNI_MEMCPY(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inDesc)); Tensor outputTensor; CHECK_STATUS(expand_infer_output_size(&inputTensor, p, &outputTensor, &UT_CPU_ARCHINFO)); diff --git a/compute/tensor/tests/test_fully_connected.cpp b/compute/tensor/tests/test_fully_connected.cpp index b6b1a847..a54b401e 100644 --- a/compute/tensor/tests/test_fully_connected.cpp +++ b/compute/tensor/tests/test_fully_connected.cpp @@ -29,15 +29,15 @@ int fullyConnectedTest(int argc, char **argv, DataType dt) Tensor inputTensor = Tensor::alloc_sized(inputDesc); U8 *input = ut_input_v(m * k, dt, UT_INIT_RANDOM); - memcpy(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inputDesc)); + UNI_MEMCPY(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inputDesc)); Tensor filterTensor = Tensor::alloc_sized(filterDesc); U8 *filter = ut_input_v(k * n, dt, UT_INIT_RANDOM); - memcpy(get_ptr_from_tensor(filterTensor, CPU_GENERAL), filter, tensorNumBytes(filterDesc)); + UNI_MEMCPY(get_ptr_from_tensor(filterTensor, CPU_GENERAL), filter, tensorNumBytes(filterDesc)); Tensor biasTensor = Tensor::alloc_sized(biasDesc); U8 *bias = ut_input_v(n, dt, UT_INIT_RANDOM); - memcpy(get_ptr_from_tensor(biasTensor, CPU_GENERAL), bias, tensorNumBytes(biasDesc)); + UNI_MEMCPY(get_ptr_from_tensor(biasTensor, CPU_GENERAL), bias, tensorNumBytes(biasDesc)); // set output Tensor outputTensor; CHECK_STATUS(fully_connected_infer_output_size( diff --git a/compute/tensor/tests/test_fully_connected_int8.cpp b/compute/tensor/tests/test_fully_connected_int8.cpp index c6133896..2709053a 100644 --- a/compute/tensor/tests/test_fully_connected_int8.cpp +++ 
b/compute/tensor/tests/test_fully_connected_int8.cpp @@ -32,23 +32,24 @@ int fullyConnectedTest(int argc, char **argv, DataType dt, DataType filterDataTy inputTensor.resize(inputDesc); inputTensor.alloc(); U8 *input = ut_input_v(m * k, dt, UT_INIT_RANDOM); - memcpy(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inputDesc)); + UNI_MEMCPY(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inputDesc)); filterTensor.resize(filterDesc); filterTensor.alloc(); U8 *filter = ut_input_v(k * n, dt, UT_INIT_RANDOM); - memcpy(get_ptr_from_tensor(filterTensor, CPU_GENERAL), filter, tensorNumBytes(filterDesc)); + UNI_MEMCPY(get_ptr_from_tensor(filterTensor, CPU_GENERAL), filter, tensorNumBytes(filterDesc)); if (m == 1) { filterDescRef.df = DF_NORMAL; } filterTensorRef.resize(filterDescRef); filterTensorRef.alloc(); - memcpy(get_ptr_from_tensor(filterTensorRef, CPU_GENERAL), filter, tensorNumBytes(filterDescRef)); + UNI_MEMCPY( + get_ptr_from_tensor(filterTensorRef, CPU_GENERAL), filter, tensorNumBytes(filterDescRef)); biasTensor.resize(biasDesc); biasTensor.alloc(); U8 *bias = ut_input_v(n, dt, UT_INIT_RANDOM); - memcpy(get_ptr_from_tensor(biasTensor, CPU_GENERAL), bias, tensorNumBytes(biasDesc)); + UNI_MEMCPY(get_ptr_from_tensor(biasTensor, CPU_GENERAL), bias, tensorNumBytes(biasDesc)); // set output Tensor outputTensor, outputTensorRef; diff --git a/compute/tensor/tests/test_fully_connected_ocl.cpp b/compute/tensor/tests/test_fully_connected_ocl.cpp index 268fed5c..5be67b1b 100644 --- a/compute/tensor/tests/test_fully_connected_ocl.cpp +++ b/compute/tensor/tests/test_fully_connected_ocl.cpp @@ -111,7 +111,7 @@ int fullyConnectedTest(int argc, char *argv[], DataType dt) biasNum = (fn + item_m - 1) / item_m * item_m; if (biasNum > fn) { U8 *bias_val = ut_input_v(biasNum, dt, UT_INIT_ZERO); - memcpy(bias_val, bias_cpu, fn * bytesOf(dt)); + UNI_MEMCPY(bias_val, bias_cpu, fn * bytesOf(dt)); free(bias_cpu); bias_cpu = bias_val; } @@ -169,18 +169,19 @@ int fullyConnectedTest(int argc, char *argv[], DataType dt) Tensor inputTensorCpu; inputTensorCpu.resize(inputDesc); inputTensorCpu.alloc(); - memcpy(get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), input_cpu, tensorNumBytes(inputDesc)); + UNI_MEMCPY( + get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), input_cpu, tensorNumBytes(inputDesc)); Tensor filterTensorCpu; filterTensorCpu.resize(filterDesc); filterTensorCpu.alloc(); - memcpy( + UNI_MEMCPY( get_ptr_from_tensor(filterTensorCpu, CPU_GENERAL), filter_cpu, tensorNumBytes(filterDesc)); Tensor biasTensorCpu; biasTensorCpu.resize(biasDesc); biasTensorCpu.alloc(); - memcpy(get_ptr_from_tensor(biasTensorCpu, CPU_GENERAL), bias_cpu, tensorNumBytes(biasDesc)); + UNI_MEMCPY(get_ptr_from_tensor(biasTensorCpu, CPU_GENERAL), bias_cpu, tensorNumBytes(biasDesc)); Tensor outputTensorCpu; outputTensorCpu.resize(outputDesc_cpu); diff --git a/compute/tensor/tests/test_gather_ocl.cpp b/compute/tensor/tests/test_gather_ocl.cpp index 41e08d93..684ac396 100644 --- a/compute/tensor/tests/test_gather_ocl.cpp +++ b/compute/tensor/tests/test_gather_ocl.cpp @@ -75,10 +75,12 @@ int gatherTest(int argc, char **argv, DataType dt) Tensor inputTensorCpu, indexTensorCpu, outputTensorCpu; inputTensorCpu.resize(inputDesc); inputTensorCpu.alloc(); - memcpy(get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), inputCpu, tensorNumBytes(inputDesc)); + UNI_MEMCPY( + get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), inputCpu, tensorNumBytes(inputDesc)); indexTensorCpu.resize(indexDesc); indexTensorCpu.alloc(); 
- memcpy(get_ptr_from_tensor(indexTensorCpu, CPU_GENERAL), indexCpu, tensorNumBytes(indexDesc)); + UNI_MEMCPY( + get_ptr_from_tensor(indexTensorCpu, CPU_GENERAL), indexCpu, tensorNumBytes(indexDesc)); CHECK_STATUS(gather_infer_output_size( &inputTensorCpu, &indexTensorCpu, p, &outputTensorCpu, &UT_SERIAL_ARCHINFO)); outputTensorCpu.alloc(); diff --git a/compute/tensor/tests/test_l2normalization.cpp b/compute/tensor/tests/test_l2normalization.cpp index d4cb317b..61f540d0 100644 --- a/compute/tensor/tests/test_l2normalization.cpp +++ b/compute/tensor/tests/test_l2normalization.cpp @@ -29,7 +29,7 @@ int l2normalizationTest(int argc, char **argv, DataType dt) Tensor inputTensor; inputTensor.resize(inputDesc); inputTensor.alloc(); - memcpy(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inputDesc)); + UNI_MEMCPY(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inputDesc)); // set output Tensor outputTensor, outputTensorRef; diff --git a/compute/tensor/tests/test_matmul_int8.cpp b/compute/tensor/tests/test_matmul_int8.cpp index dabb4ac4..1cdaa6a8 100644 --- a/compute/tensor/tests/test_matmul_int8.cpp +++ b/compute/tensor/tests/test_matmul_int8.cpp @@ -55,10 +55,12 @@ int MatmulTest(int argc, char **argv, DataType dt, DataType filterDataType) } matrixBTensor.set_scale(1); matrixBTensorRef.set_scale(1); - memcpy(get_ptr_from_tensor(matrixATensor, CPU_GENERAL), A, tensorNumBytes(matrixADesc)); - memcpy(get_ptr_from_tensor(matrixATensorRef, CPU_GENERAL), ARef, tensorNumBytes(matrixADescRef)); - memcpy(get_ptr_from_tensor(matrixBTensor, CPU_GENERAL), B, tensorNumBytes(matrixBDesc)); - memcpy(get_ptr_from_tensor(matrixBTensorRef, CPU_GENERAL), BRef, tensorNumBytes(matrixBDescRef)); + UNI_MEMCPY(get_ptr_from_tensor(matrixATensor, CPU_GENERAL), A, tensorNumBytes(matrixADesc)); + UNI_MEMCPY( + get_ptr_from_tensor(matrixATensorRef, CPU_GENERAL), ARef, tensorNumBytes(matrixADescRef)); + UNI_MEMCPY(get_ptr_from_tensor(matrixBTensor, CPU_GENERAL), B, tensorNumBytes(matrixBDesc)); + UNI_MEMCPY( + get_ptr_from_tensor(matrixBTensorRef, CPU_GENERAL), BRef, tensorNumBytes(matrixBDescRef)); bool transposeA = (matrixADesc.df == DF_TRANSPOSE); bool transposeB = (matrixBDesc.df == DF_TRANSPOSE); diff --git a/compute/tensor/tests/test_matmul_ocl.cpp b/compute/tensor/tests/test_matmul_ocl.cpp index d71d96af..4cd759bc 100644 --- a/compute/tensor/tests/test_matmul_ocl.cpp +++ b/compute/tensor/tests/test_matmul_ocl.cpp @@ -194,13 +194,13 @@ int matmulTest(int argc, char *argv[], DataType dt) Tensor matrixATensorCpu; matrixATensorCpu.resize(matrixADesc); matrixATensorCpu.alloc(); - memcpy(get_ptr_from_tensor(matrixATensorCpu, CPU_GENERAL), matrixA_cpu, + UNI_MEMCPY(get_ptr_from_tensor(matrixATensorCpu, CPU_GENERAL), matrixA_cpu, tensorNumBytes(matrixADesc)); Tensor matrixBTensorCpu; matrixBTensorCpu.resize(matrixBDesc); matrixBTensorCpu.alloc(); - memcpy(get_ptr_from_tensor(matrixBTensorCpu, CPU_GENERAL), matrixB_cpu, + UNI_MEMCPY(get_ptr_from_tensor(matrixBTensorCpu, CPU_GENERAL), matrixB_cpu, tensorNumBytes(matrixBDesc)); Tensor matrixCTensorCpu; diff --git a/compute/tensor/tests/test_matmul_ocl_f32.cpp b/compute/tensor/tests/test_matmul_ocl_f32.cpp index e34a0593..57ca760b 100644 --- a/compute/tensor/tests/test_matmul_ocl_f32.cpp +++ b/compute/tensor/tests/test_matmul_ocl_f32.cpp @@ -32,13 +32,13 @@ inline U8 *matmulF32Cpu(TensorDesc matrixADesc, Tensor matrixATensorCpu; matrixATensorCpu.resize(matrixADesc); matrixATensorCpu.alloc(); - memcpy(get_ptr_from_tensor(matrixATensorCpu, 
CPU_GENERAL), matrixA_cpu, + UNI_MEMCPY(get_ptr_from_tensor(matrixATensorCpu, CPU_GENERAL), matrixA_cpu, tensorNumBytes(matrixADesc)); Tensor matrixBTensorCpu; matrixBTensorCpu.resize(matrixBDesc); matrixBTensorCpu.alloc(); - memcpy(get_ptr_from_tensor(matrixBTensorCpu, CPU_GENERAL), matrixB_cpu, + UNI_MEMCPY(get_ptr_from_tensor(matrixBTensorCpu, CPU_GENERAL), matrixB_cpu, tensorNumBytes(matrixBDesc)); CHECK_STATUS(matmul_infer_output_size(&matrixATensorCpu, transposeA, &matrixBTensorCpu, diff --git a/compute/tensor/tests/test_non_max_suppression.cpp b/compute/tensor/tests/test_non_max_suppression.cpp index 0a3a5353..b4147d05 100644 --- a/compute/tensor/tests/test_non_max_suppression.cpp +++ b/compute/tensor/tests/test_non_max_suppression.cpp @@ -25,9 +25,6 @@ int nonmaxsuppressionTest(int argc, char **argv, DataType dt) U32 in1 = atoi(argv[4]); U32 ic1 = atoi(argv[5]); U32 ilens1 = atoi(argv[6]); - // output - U32 oh = atoi(argv[7]); - U32 ow = atoi(argv[8]); // nonMaxSuppressionParamSpec U32 max_output_boxes_per_class = atoi(argv[9]); F32 iou_threshold = (F32)atof(argv[10]); @@ -45,11 +42,11 @@ int nonmaxsuppressionTest(int argc, char **argv, DataType dt) inputTensors[1] = Tensor::alloc_sized(input_desc_scores); U32 input_len_boxes = tensorNumElements(input_desc_boxes); U8 *input_boxes = ut_input_v(input_len_boxes, dt, UT_INIT_RANDOM); - memcpy(get_ptr_from_tensor(inputTensors[0], CPU_GENERAL), input_boxes, + UNI_MEMCPY(get_ptr_from_tensor(inputTensors[0], CPU_GENERAL), input_boxes, tensorNumBytes(input_desc_boxes)); U32 input_len_scores = tensorNumElements(input_desc_scores); U8 *input_scores = ut_input_v(input_len_scores, dt, UT_INIT_RANDOM); - memcpy(get_ptr_from_tensor(inputTensors[1], CPU_GENERAL), input_scores, + UNI_MEMCPY(get_ptr_from_tensor(inputTensors[1], CPU_GENERAL), input_scores, tensorNumBytes(input_desc_scores)); std::vector inputTensorsPtr(2); inputTensorsPtr[0] = &inputTensors[0]; @@ -60,9 +57,8 @@ int nonmaxsuppressionTest(int argc, char **argv, DataType dt) inputTensorsPtr, nonMaxSuppressionParamSpec, &outputTensor, &UT_CPU_ARCHINFO)); outputTensor.alloc(); Tensor outputTensorRef = Tensor::alloc_sized(outputTensor.get_desc()); - U32 output_len = outputTensor.length(); - CHECK_REQUIREMENT(input_len_boxes == in0 * ic0 * ilens0 && - input_len_scores == in1 * ic1 * ilens1 && output_len == oh * ow); + CHECK_REQUIREMENT( + input_len_boxes == in0 * ic0 * ilens0 && input_len_scores == in1 * ic1 * ilens1); /* You can also change codes and use datas in the following example. 
Command: ./test_non_max_suppression 1 6 4 1 2 6 7 3 3 0.5 0 @@ -90,35 +86,16 @@ int nonmaxsuppressionTest(int argc, char **argv, DataType dt) inputTensors, nonMaxSuppressionParamSpec, outputTensorRef, &UT_SERIAL_ARCHINFO)); // check ut_check_v(get_ptr_from_tensor(outputTensor, CPU_GENERAL), - get_ptr_from_tensor(outputTensorRef, CPU_GENERAL), output_len, dt, 0.05, __FILE__, - __LINE__); + get_ptr_from_tensor(outputTensorRef, CPU_GENERAL), outputTensor.length(), dt, 0.05, + __FILE__, __LINE__); } - U32 num_detected_max = max_output_boxes_per_class * ic1; - if (dt == DT_F32) { - F32 *output_f32 = (F32 *)get_ptr_from_tensor(outputTensor, CPU_GENERAL); - int idx = 0; - for (U32 i = 0; i < 1 + num_detected_max; i++) { - for (int j = 0; j < 3; j++) { - printf("%d:%f ", j, output_f32[idx + j]); - } - printf("\n"); - idx = idx + 3; - } + TensorDesc outputDesc = outputTensor.get_desc(); + I32 *out = (I32 *)get_ptr_from_tensor(outputTensor, CPU_GENERAL); + U32 num_detected = outputDesc.dims[1]; + for (U32 i = 0; i < num_detected; i++) { + printf("(%d, %d, %d)\n", out[i * 3], out[i * 3 + 1], out[i * 3 + 2]); } -#ifdef _USE_FP16 - if (dt == DT_F16) { - F16 *output_f16 = (F16 *)get_ptr_from_tensor(outputTensorRef, CPU_GENERAL); - int idx = 0; - for (U32 i = 0; i < 1 + num_detected_max; i++) { - for (int j = 0; j < 3; j++) { - printf("%d:%f ", j + 1, output_f16[idx + j]); - } - printf("\n"); - idx = idx + 3; - } - } -#endif free(input_boxes); free(input_scores); return 0; diff --git a/compute/tensor/tests/test_normalization.cpp b/compute/tensor/tests/test_normalization.cpp index 63337cbe..22aa5dcb 100644 --- a/compute/tensor/tests/test_normalization.cpp +++ b/compute/tensor/tests/test_normalization.cpp @@ -22,6 +22,8 @@ int normalizationTest(int argc, char **argv, DataType dt) U32 ic = atoi(argv[3]); U32 ih = atoi(argv[4]); U32 iw = atoi(argv[5]); + LayerNormParamSpec p; + p.axis = -1; DataFormat df = DF_MTK; Tensor inputTensor; @@ -30,7 +32,7 @@ int normalizationTest(int argc, char **argv, DataType dt) inputTensor.alloc(); U32 input_len = tensorNumElements(inputDesc); U8 *input = ut_input_v(input_len, dt, UT_INIT_RANDOM); - memcpy(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inputDesc)); + UNI_MEMCPY(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inputDesc)); // set output Tensor outputTensor, outputTensorRef; @@ -56,16 +58,16 @@ int normalizationTest(int argc, char **argv, DataType dt) betaTensor.resize(betaDesc); alphaTensor.alloc(); betaTensor.alloc(); - memcpy(get_ptr_from_tensor(alphaTensor, CPU_GENERAL), alpha_list, tensorNumBytes(alphaDesc)); - memcpy(get_ptr_from_tensor(betaTensor, CPU_GENERAL), beta_list, tensorNumBytes(betaDesc)); + UNI_MEMCPY(get_ptr_from_tensor(alphaTensor, CPU_GENERAL), alpha_list, tensorNumBytes(alphaDesc)); + UNI_MEMCPY(get_ptr_from_tensor(betaTensor, CPU_GENERAL), beta_list, tensorNumBytes(betaDesc)); if (UT_CHECK) { CHECK_STATUS(layer_normalization( - inputTensor, alphaTensor, betaTensor, tmpTensor, outputTensor, &UT_CPU_ARCHINFO)); + inputTensor, p, alphaTensor, betaTensor, tmpTensor, outputTensor, &UT_CPU_ARCHINFO)); // naive implement - CHECK_STATUS(layer_normalization( - inputTensor, alphaTensor, betaTensor, tmpTensor, outputTensorRef, &UT_SERIAL_ARCHINFO)); + CHECK_STATUS(layer_normalization(inputTensor, p, alphaTensor, betaTensor, tmpTensor, + outputTensorRef, &UT_SERIAL_ARCHINFO)); // check ut_check_v(get_ptr_from_tensor(outputTensor, CPU_GENERAL), @@ -77,7 +79,7 @@ int normalizationTest(int argc, char **argv, 
DataType dt) double time_start = ut_time_ms(); for (int iter = 0; iter < UT_LOOPS; iter++) { CHECK_STATUS(layer_normalization( - inputTensor, alphaTensor, betaTensor, tmpTensor, outputTensor, &UT_CPU_ARCHINFO)); + inputTensor, p, alphaTensor, betaTensor, tmpTensor, outputTensor, &UT_CPU_ARCHINFO)); } double time_end = ut_time_ms(); double time = (time_end - time_start) / UT_LOOPS; diff --git a/compute/tensor/tests/test_padding.cpp b/compute/tensor/tests/test_padding.cpp index c6ddf9ea..e310c2bb 100644 --- a/compute/tensor/tests/test_padding.cpp +++ b/compute/tensor/tests/test_padding.cpp @@ -52,20 +52,20 @@ int paddingTest(int argc, char **argv, DataType dt) padParamSpec.constant_value = 0.0; switch (mode) { case 0: { - padParamSpec.pad_mode = Pad_Constant; + padParamSpec.pad_mode = PAD_CONSTANT; break; } case 1: { - padParamSpec.pad_mode = Pad_Edge; + padParamSpec.pad_mode = PAD_EDGE; break; } case 2: { // limitation: the h_fir and the h_sec should lower than 0 - padParamSpec.pad_mode = Pad_Reflect; + padParamSpec.pad_mode = PAD_REFLECT; break; } case 3: { - padParamSpec.pad_mode = Pad_Symmetric; + padParamSpec.pad_mode = PAD_SYMMETRIC; break; } default: { @@ -80,7 +80,7 @@ int paddingTest(int argc, char **argv, DataType dt) inputTensor.alloc(); U32 input_len = tensorNumElements(inputDesc); U8 *input = ut_input_v(input_len, dt, UT_INIT_RANDOM); - memcpy(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inputDesc)); + UNI_MEMCPY(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inputDesc)); // set output Tensor outputTensor, outputTensorRef; diff --git a/compute/tensor/tests/test_padding_ocl.cpp b/compute/tensor/tests/test_padding_ocl.cpp index 677325bd..df8db6a2 100644 --- a/compute/tensor/tests/test_padding_ocl.cpp +++ b/compute/tensor/tests/test_padding_ocl.cpp @@ -44,20 +44,20 @@ int paddingTest(int argc, char **argv, DataType dt) padParamSpec.constant_value = 0.0; switch (mode) { case 0: { - padParamSpec.pad_mode = Pad_Constant; + padParamSpec.pad_mode = PAD_CONSTANT; break; } case 1: { - padParamSpec.pad_mode = Pad_Edge; + padParamSpec.pad_mode = PAD_EDGE; break; } case 2: { // limitation: the h_fir and the h_sec should lower than 0 - padParamSpec.pad_mode = Pad_Reflect; + padParamSpec.pad_mode = PAD_REFLECT; break; } case 3: { - padParamSpec.pad_mode = Pad_Symmetric; + padParamSpec.pad_mode = PAD_SYMMETRIC; break; } default: { @@ -135,7 +135,8 @@ int paddingTest(int argc, char **argv, DataType dt) Tensor inputTensorCpu; inputTensorCpu.resize(inputDescCPU); inputTensorCpu.alloc(); - memcpy(get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), inputCPU, tensorNumBytes(inputDescCPU)); + UNI_MEMCPY( + get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), inputCPU, tensorNumBytes(inputDescCPU)); Tensor outputTensorCpu; CHECK_STATUS(padding_infer_output_size( diff --git a/compute/tensor/tests/test_pooling.cpp b/compute/tensor/tests/test_pooling.cpp index 67555d0e..07055813 100644 --- a/compute/tensor/tests/test_pooling.cpp +++ b/compute/tensor/tests/test_pooling.cpp @@ -26,19 +26,19 @@ int poolingTest(int argc, char **argv, DataType dt) PoolingParamSpec p; p.mode = POOLING_MAX; - p.rm = CEIL; + p.round_mode = ROUND_CEIL; p.kernel_t = atoi(argv[6]); p.kernel_h = atoi(argv[7]); p.kernel_w = atoi(argv[8]); p.stride_t = atoi(argv[9]); p.stride_h = atoi(argv[10]); p.stride_w = atoi(argv[11]); - p.padding_before = atoi(argv[12]); - p.padding_after = atoi(argv[13]); - p.padding_top = atoi(argv[14]); - p.padding_bottom = atoi(argv[15]); - p.padding_left = 
atoi(argv[16]); - p.padding_right = atoi(argv[17]); + p.pad_before = atoi(argv[12]); + p.pad_after = atoi(argv[13]); + p.pad_top = atoi(argv[14]); + p.pad_bottom = atoi(argv[15]); + p.pad_left = atoi(argv[16]); + p.pad_right = atoi(argv[17]); TensorDesc inputDesc; if (it == 1) { @@ -49,7 +49,7 @@ int poolingTest(int argc, char **argv, DataType dt) Tensor inputTensor = Tensor::alloc_sized(inputDesc); U32 input_len = inputTensor.length(); U8 *input = ut_input_v(input_len, dt, UT_INIT_RANDOM); - memcpy(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, inputTensor.bytes()); + UNI_MEMCPY(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, inputTensor.bytes()); // set output Tensor outputTensor; diff --git a/compute/tensor/tests/test_pooling_bp.cpp b/compute/tensor/tests/test_pooling_bp.cpp index 34932ab8..c94a5cc4 100644 --- a/compute/tensor/tests/test_pooling_bp.cpp +++ b/compute/tensor/tests/test_pooling_bp.cpp @@ -26,19 +26,19 @@ int poolingbpTest(int argc, char **argv, DataType dt) PoolingParamSpec p; p.mode = POOLING_MEAN; - p.rm = CEIL; + p.round_mode = ROUND_CEIL; p.kernel_t = atoi(argv[6]); p.kernel_h = atoi(argv[7]); p.kernel_w = atoi(argv[8]); p.stride_t = atoi(argv[9]); p.stride_h = atoi(argv[10]); p.stride_w = atoi(argv[11]); - p.padding_before = atoi(argv[12]); - p.padding_after = atoi(argv[13]); - p.padding_top = atoi(argv[14]); - p.padding_bottom = atoi(argv[15]); - p.padding_left = atoi(argv[16]); - p.padding_right = atoi(argv[17]); + p.pad_before = atoi(argv[12]); + p.pad_after = atoi(argv[13]); + p.pad_top = atoi(argv[14]); + p.pad_bottom = atoi(argv[15]); + p.pad_left = atoi(argv[16]); + p.pad_right = atoi(argv[17]); TensorDesc inputDesc; if (it == 1) { @@ -49,7 +49,7 @@ int poolingbpTest(int argc, char **argv, DataType dt) Tensor inputTensor = Tensor::alloc_sized(inputDesc); U32 input_len = inputTensor.length(); U8 *input = ut_input_v(input_len, dt, UT_INIT_RANDOM); - memcpy(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, inputTensor.bytes()); + UNI_MEMCPY(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, inputTensor.bytes()); // set output Tensor outputTensor; diff --git a/compute/tensor/tests/test_pooling_int8.cpp b/compute/tensor/tests/test_pooling_int8.cpp index 05634f24..767d7c84 100644 --- a/compute/tensor/tests/test_pooling_int8.cpp +++ b/compute/tensor/tests/test_pooling_int8.cpp @@ -27,19 +27,19 @@ int int8PoolingTest(int argc, char **argv, DataType dt) PoolingParamSpec p; p.mode = POOLING_MEAN; - p.rm = CEIL; + p.round_mode = ROUND_CEIL; p.kernel_t = atoi(argv[6]); p.kernel_h = atoi(argv[7]); p.kernel_w = atoi(argv[8]); p.stride_t = atoi(argv[9]); p.stride_h = atoi(argv[10]); p.stride_w = atoi(argv[11]); - p.padding_before = atoi(argv[12]); - p.padding_after = atoi(argv[13]); - p.padding_top = atoi(argv[14]); - p.padding_bottom = atoi(argv[15]); - p.padding_left = atoi(argv[16]); - p.padding_right = atoi(argv[17]); + p.pad_before = atoi(argv[12]); + p.pad_after = atoi(argv[13]); + p.pad_top = atoi(argv[14]); + p.pad_bottom = atoi(argv[15]); + p.pad_left = atoi(argv[16]); + p.pad_right = atoi(argv[17]); TensorDesc inputDesc = tensor4df(DT_I8, DF_NCHWC8, in, ic, ih, iw); TensorDesc inputDescRef = inputDesc; diff --git a/compute/tensor/tests/test_pooling_ocl.cpp b/compute/tensor/tests/test_pooling_ocl.cpp index a9ceee44..71c8cdb6 100644 --- a/compute/tensor/tests/test_pooling_ocl.cpp +++ b/compute/tensor/tests/test_pooling_ocl.cpp @@ -47,19 +47,19 @@ int poolingTest(int argc, char **argv, DataType dt) PoolingParamSpec p; p.mode = POOLING_MEAN; - p.rm = 
CEIL; + p.round_mode = ROUND_CEIL; p.kernel_t = atoi(argv[6]); p.kernel_h = atoi(argv[7]); p.kernel_w = atoi(argv[8]); p.stride_t = atoi(argv[9]); p.stride_h = atoi(argv[10]); p.stride_w = atoi(argv[11]); - p.padding_before = atoi(argv[12]); - p.padding_after = atoi(argv[13]); - p.padding_top = atoi(argv[14]); - p.padding_bottom = atoi(argv[15]); - p.padding_left = atoi(argv[16]); - p.padding_right = atoi(argv[17]); + p.pad_before = atoi(argv[12]); + p.pad_after = atoi(argv[13]); + p.pad_top = atoi(argv[14]); + p.pad_bottom = atoi(argv[15]); + p.pad_left = atoi(argv[16]); + p.pad_right = atoi(argv[17]); ArchInfo archInfo; archInfo.arch = MALI; @@ -84,7 +84,7 @@ int poolingTest(int argc, char **argv, DataType dt) Tensor inputTensorCpu; inputTensorCpu.resize(inputDescCpu); inputTensorCpu.alloc(); - memcpy(get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), input_cpu_nchwc8, + UNI_MEMCPY(get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), input_cpu_nchwc8, tensorNumBytes(inputDescCpu)); Tensor outputTensorCpu; diff --git a/compute/tensor/tests/test_power.cpp b/compute/tensor/tests/test_power.cpp index a61371fc..e4146da4 100644 --- a/compute/tensor/tests/test_power.cpp +++ b/compute/tensor/tests/test_power.cpp @@ -28,10 +28,10 @@ int powerTest(int argc, char **argv, DataType dt) inputTensor.resize(inputDesc); inputTensor.alloc(); U8 *input = ut_input_v(len, dt, UT_INIT_RANDOM); - memcpy(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inputDesc)); + UNI_MEMCPY(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inputDesc)); // set output Tensor outputTensor, outputTensorRef; - CHECK_STATUS(power_infer_output_size(&inputTensor, &outputTensor, &UT_CPU_ARCHINFO)); + CHECK_STATUS(power_infer_output_size(&inputTensor, p, &outputTensor, &UT_CPU_ARCHINFO)); outputTensor.alloc(); TensorDesc outputDesc_ref = outputTensor.get_desc(); outputTensorRef.resize(outputDesc_ref); diff --git a/compute/tensor/tests/test_power_ocl.cpp b/compute/tensor/tests/test_power_ocl.cpp index bcc3dc70..009f041b 100644 --- a/compute/tensor/tests/test_power_ocl.cpp +++ b/compute/tensor/tests/test_power_ocl.cpp @@ -61,7 +61,7 @@ int powerTest(int argc, char **argv, DataType dt) MaliPara maliPara; maliPara.handle = handle; archInfo.archPara = &maliPara; - CHECK_STATUS(power_infer_output_size(&inputTensor, &outputTensor, &archInfo)); + CHECK_STATUS(power_infer_output_size(&inputTensor, p, &outputTensor, &archInfo)); TensorDesc output_desc_gpu = outputTensor.get_desc(); U8 *output_gpu = ut_input_v(on * oc * oh * ow, dt, UT_INIT_RANDOM); @@ -106,7 +106,7 @@ int powerTest(int argc, char **argv, DataType dt) Tensor inputTensorCpu; inputTensorCpu.resize(input_desc_cpu); inputTensorCpu.alloc(); - memcpy(get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), input_cpu, + UNI_MEMCPY(get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), input_cpu, tensorNumBytes(input_desc_cpu)); Tensor outputTensorCpu; diff --git a/compute/tensor/tests/test_prelu.cpp b/compute/tensor/tests/test_prelu.cpp index e5536df0..cbd2c109 100644 --- a/compute/tensor/tests/test_prelu.cpp +++ b/compute/tensor/tests/test_prelu.cpp @@ -35,8 +35,8 @@ int preluTest(int argc, char **argv, DataType dt) Tensor inputTensor = Tensor::alloc_sized(inputDesc); Tensor weightTensor = Tensor::alloc_sized(weightDesc); - memcpy(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inputDesc)); - memcpy(get_ptr_from_tensor(weightTensor, CPU_GENERAL), weight, tensorNumBytes(weightDesc)); + UNI_MEMCPY(get_ptr_from_tensor(inputTensor, 
CPU_GENERAL), input, tensorNumBytes(inputDesc)); + UNI_MEMCPY(get_ptr_from_tensor(weightTensor, CPU_GENERAL), weight, tensorNumBytes(weightDesc)); // set output Tensor outputTensor; diff --git a/compute/tensor/tests/test_prelu_ocl.cpp b/compute/tensor/tests/test_prelu_ocl.cpp index 473803b5..a6c2a991 100644 --- a/compute/tensor/tests/test_prelu_ocl.cpp +++ b/compute/tensor/tests/test_prelu_ocl.cpp @@ -70,7 +70,7 @@ int preluTest(int argc, char **argv, DataType dt) U32 icAlign = (ic + 3) / 4 * 4; if (!preluDesc.propagate_down) { U8 *weightAlign = ut_input_v(icAlign, dt, UT_INIT_ZERO); - memcpy(weightAlign, weightCPU, ic * bytesOf(dt)); + UNI_MEMCPY(weightAlign, weightCPU, ic * bytesOf(dt)); free(weightCPU); weightCPU = weightAlign; alloc_padding(weightTensor, 0, icAlign - ic, 0, 0, weightCPU); diff --git a/compute/tensor/tests/test_reduction.cpp b/compute/tensor/tests/test_reduction.cpp index 327af183..1fda5cf1 100644 --- a/compute/tensor/tests/test_reduction.cpp +++ b/compute/tensor/tests/test_reduction.cpp @@ -22,11 +22,11 @@ int reductionTest(int argc, char **argv, DataType dt) U32 ih = atoi(argv[3]); U32 iw = atoi(argv[4]); ReductionParamSpec p; - p.axes_num = atoi(argv[5]); - for (int i = 0; i < p.axes_num; i++) { + p.num_axes = atoi(argv[5]); + for (int i = 0; i < p.num_axes; i++) { p.axes[i] = atoi(argv[6 + i]); } - p.reduction_mode = REDUCTION_MEAN; + p.mode = REDUCTION_MEAN; p.coeff = 1.0; p.keep_dim = true; DataFormat df = DF_NCHW; @@ -38,7 +38,7 @@ int reductionTest(int argc, char **argv, DataType dt) Tensor inputTensor; inputTensor.resize(inDesc); inputTensor.alloc(); - memcpy(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inDesc)); + UNI_MEMCPY(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inDesc)); Tensor maskTensor; maskTensor.resize(maskDesc); @@ -85,7 +85,7 @@ int reductionTest(int argc, char **argv, DataType dt) CHECK_STATUS(tensor4dGet(outputTensor.get_desc(), &dt, &df, &on, &oc, &oh, &ow)); char buffer[150]; char params[120]; - sprintf(params, "(%u %u %u %u) %d =(%u %u %u %u)", in, ic, ih, iw, p.axes_num, on, oc, oh, ow); + sprintf(params, "(%u %u %u %u) %d =(%u %u %u %u)", in, ic, ih, iw, p.num_axes, on, oc, oh, ow); sprintf(buffer, "%20s, %80s", "Reduction", params); double ops = 1.0 * in * ic * ih * iw; ut_log(dt, buffer, ops, time / UT_LOOPS); diff --git a/compute/tensor/tests/test_reduction_ocl.cpp b/compute/tensor/tests/test_reduction_ocl.cpp index b7a67d33..9d2bb0a9 100644 --- a/compute/tensor/tests/test_reduction_ocl.cpp +++ b/compute/tensor/tests/test_reduction_ocl.cpp @@ -24,11 +24,11 @@ int reductionTest(int argc, char **argv, DataType dt) ReductionParamSpec p; p.keep_dim = atoi(argv[5]); bool use_c4 = atoi(argv[6]); - p.axes_num = atoi(argv[7]); - for (int i = 0; i < p.axes_num; i++) { + p.num_axes = atoi(argv[7]); + for (int i = 0; i < p.num_axes; i++) { p.axes[i] = atoi(argv[8 + i]); } - p.reduction_mode = REDUCTION_MEAN; + p.mode = REDUCTION_MEAN; p.coeff = 1.0; TensorDesc maskDesc; maskDesc.nDims = 0; @@ -45,7 +45,8 @@ int reductionTest(int argc, char **argv, DataType dt) Tensor inputTensorCpu; inputTensorCpu.resize(inputDesc); inputTensorCpu.alloc(); - memcpy(get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), input_cpu, tensorNumBytes(inputDesc)); + UNI_MEMCPY( + get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), input_cpu, tensorNumBytes(inputDesc)); Tensor maskTensorCpu; maskTensorCpu.resize(maskDesc); @@ -117,7 +118,7 @@ int reductionTest(int argc, char **argv, DataType dt) char buffer[150]; char params[120]; 
- sprintf(params, "(%u %u %u %u) %d =(%u %u %u %u)", in, ic, ih, iw, p.axes_num, on, oc, oh, ow); + sprintf(params, "(%u %u %u %u) %d =(%u %u %u %u)", in, ic, ih, iw, p.num_axes, on, oc, oh, ow); sprintf(buffer, "%20s, %80s", "Reduction", params); #ifdef _DEBUG double ops = len; diff --git a/compute/tensor/tests/test_reshape.cpp b/compute/tensor/tests/test_reshape.cpp index 2bf39380..0d73239b 100644 --- a/compute/tensor/tests/test_reshape.cpp +++ b/compute/tensor/tests/test_reshape.cpp @@ -24,10 +24,10 @@ int reshapeTest(int argc, char **argv, DataType dt) U32 ih = atoi(argv[3]); U32 iw = atoi(argv[4]); ReshapeParamSpec p; - p.shape_size = atoi(argv[5]); - CHECK_REQUIREMENT(argc == 6 + p.shape_size); - for (I32 i = 0; i < p.shape_size; i++) { - p.shape_dims[i] = atoi(argv[6 + i]); + p.num_shape = atoi(argv[5]); + CHECK_REQUIREMENT(argc == 6 + p.num_shape); + for (I32 i = 0; i < p.num_shape; i++) { + p.shape[i] = atoi(argv[6 + i]); } DataFormat df = DF_NCHW; @@ -37,7 +37,7 @@ int reshapeTest(int argc, char **argv, DataType dt) Tensor inputTensor; inputTensor.resize(inDesc); inputTensor.alloc(); - memcpy(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inDesc)); + UNI_MEMCPY(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inDesc)); Tensor outputTensor; CHECK_STATUS(reshape_infer_output_size(&inputTensor, p, &outputTensor, &UT_CPU_ARCHINFO)); @@ -61,16 +61,16 @@ int reshapeTest(int argc, char **argv, DataType dt) // log performance data char buffer[150]; char params[120]; - memset(params, 0, 120); + UNI_MEMSET(params, 0, 120); sprintf(params, "(%u %u %u %u)=(", in, ic, ih, iw); - for (I32 i = 0; i < p.shape_size; i++) { + for (I32 i = 0; i < p.num_shape; i++) { I32 index = 0; for (; index < 120; index++) { if (params[index] == '\0') { break; } } - if (i != p.shape_size - 1) { + if (i != p.num_shape - 1) { sprintf(params + index, "%d ", outDesc.dims[outDesc.nDims - 1 - i]); } else { sprintf(params + index, "%d)", outDesc.dims[outDesc.nDims - 1 - i]); diff --git a/compute/tensor/tests/test_reshape_ocl.cpp b/compute/tensor/tests/test_reshape_ocl.cpp index f17b15f3..b7d5fde6 100644 --- a/compute/tensor/tests/test_reshape_ocl.cpp +++ b/compute/tensor/tests/test_reshape_ocl.cpp @@ -24,9 +24,9 @@ int reshapeTest(int argc, char **argv, DataType dt) for (U32 i = 0; i < inputDesc.nDims; i++) { inputDesc.dims[inputDesc.nDims - i - 1] = atoi(argv[i + 2]); } - p.shape_size = atoi(argv[inputDesc.nDims + 2]); - for (I32 i = 0; i < p.shape_size; i++) { - p.shape_dims[i] = atoi(argv[i + inputDesc.nDims + 3]); + p.num_shape = atoi(argv[inputDesc.nDims + 2]); + for (I32 i = 0; i < p.num_shape; i++) { + p.shape[i] = atoi(argv[i + inputDesc.nDims + 3]); } ArchInfo archInfo; @@ -38,7 +38,8 @@ int reshapeTest(int argc, char **argv, DataType dt) Tensor inputTensorCpu; inputTensorCpu.resize(inputDesc); inputTensorCpu.alloc(); - memcpy(get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), input_cpu, tensorNumBytes(inputDesc)); + UNI_MEMCPY( + get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), input_cpu, tensorNumBytes(inputDesc)); Tensor outputTensorCpu; Tensor tmpTensorCpu; @@ -98,7 +99,7 @@ int reshapeTest(int argc, char **argv, DataType dt) char buffer[150]; char params[120]; - memset(params, 0, 120); + UNI_MEMSET(params, 0, 120); sprintf(params, "("); for (U32 i = 0; i < inputDesc.nDims; i++) { if (i != inputDesc.nDims - 1) { @@ -107,14 +108,14 @@ int reshapeTest(int argc, char **argv, DataType dt) sprintf(params + i * 2 + 1, "%d) = (", inputDesc.dims[inputDesc.nDims - 1 - i]); 
} } - for (I32 i = 0; i < p.shape_size; i++) { + for (I32 i = 0; i < p.num_shape; i++) { I32 index = 0; for (; index < 120; index++) { if (params[index] == '\0') { break; } } - if (i != p.shape_size - 1) { + if (i != p.num_shape - 1) { sprintf(params + index, "%d ", outputDesc.dims[outputDesc.nDims - 1 - i]); } else { sprintf(params + index, "%d)", outputDesc.dims[outputDesc.nDims - 1 - i]); diff --git a/compute/tensor/tests/test_rnn.cpp b/compute/tensor/tests/test_rnn.cpp index 556fa58a..f78b58b9 100644 --- a/compute/tensor/tests/test_rnn.cpp +++ b/compute/tensor/tests/test_rnn.cpp @@ -25,20 +25,20 @@ int rnnTest(int argc, char **argv, DataType dt, RNNMode mode) RNNParamSpec rnnParamSpec; rnnParamSpec.mode = mode; rnnParamSpec.steps = step; - rnnParamSpec.biDirection = false; - rnnParamSpec.numOutput = hDim; - rnnParamSpec.numProjection = 0; - rnnParamSpec.forgetBias = 1.0; - rnnParamSpec.activationMode = ACTIVATION_TANH; - rnnParamSpec.zoneoutCell = 0; - rnnParamSpec.zoneoutOutput = 0; + rnnParamSpec.bi_direction = false; + rnnParamSpec.num_outputs = hDim; + rnnParamSpec.num_projection = 0; + rnnParamSpec.forget_bias = 1.0; + rnnParamSpec.activation_type = ACTIVATION_TANH; + rnnParamSpec.zoneout_cell = 0; + rnnParamSpec.zoneout_output = 0; U32 weightNum = 1; U32 biasNum = 1; int factor = 0; switch (mode) { case RNN_LSTM: - rnnParamSpec.numProjection = 1024; + rnnParamSpec.num_projection = 1024; factor = 4; break; case RNN_GRU: @@ -52,39 +52,39 @@ int rnnTest(int argc, char **argv, DataType dt, RNNMode mode) return 1; } F32 threshold = 10; - if (rnnParamSpec.numProjection > 0) { + if (rnnParamSpec.num_projection > 0) { weightNum++; biasNum++; threshold = 40; } if (rnnParamSpec.mode != RNN_LSTM) { - rnnParamSpec.numProjection = 0; - rnnParamSpec.forgetBias = 0; + rnnParamSpec.num_projection = 0; + rnnParamSpec.forget_bias = 0; } - U32 column = (rnnParamSpec.numProjection > 0) ? rnnParamSpec.numProjection - : rnnParamSpec.numOutput; + U32 column = (rnnParamSpec.num_projection > 0) ? 
rnnParamSpec.num_projection + : rnnParamSpec.num_outputs; TensorDesc inputDesc = tensor3df(dt, DF_MTK, batch, step, xDim); Tensor inputTensor; inputTensor.resize(inputDesc); inputTensor.alloc(); U32 inputLength = batch * step * xDim; U8 *input = ut_input_v(inputLength, dt, UT_INIT_RANDOM); - memcpy(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inputDesc)); + UNI_MEMCPY(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inputDesc)); U32 tmpBytes; std::vector filterDesc(2), biasDesc(2); filterDesc[0] = tensor2df(dt, DF_NK, factor * column, xDim + hDim); - filterDesc[1] = tensor2df(dt, DF_NK, rnnParamSpec.numOutput, rnnParamSpec.numProjection); + filterDesc[1] = tensor2df(dt, DF_NK, rnnParamSpec.num_outputs, rnnParamSpec.num_projection); biasDesc[0] = tensor1d(dt, column * factor); - biasDesc[1] = tensor1d(dt, rnnParamSpec.numOutput); + biasDesc[1] = tensor1d(dt, rnnParamSpec.num_outputs); std::vector filterTensor(weightNum), biasTensor(biasNum); for (U32 i = 0; i < weightNum; i++) { filterTensor[i].resize(filterDesc[i]); filterTensor[i].alloc(); U8 *filter = ut_input_v(tensorNumBytes(filterDesc[i]) / bytesOf(dt), dt, UT_INIT_RANDOM); - memcpy(get_ptr_from_tensor(filterTensor[i], CPU_GENERAL), filter, + UNI_MEMCPY(get_ptr_from_tensor(filterTensor[i], CPU_GENERAL), filter, tensorNumBytes(filterDesc[i])); free(filter); } @@ -93,7 +93,8 @@ int rnnTest(int argc, char **argv, DataType dt, RNNMode mode) biasTensor[i].resize(biasDesc[i]); biasTensor[i].alloc(); U8 *bias = ut_input_v(tensorNumBytes(biasDesc[i]) / bytesOf(dt), dt, UT_INIT_RANDOM); - memcpy(get_ptr_from_tensor(biasTensor[i], CPU_GENERAL), bias, tensorNumBytes(biasDesc[i])); + UNI_MEMCPY( + get_ptr_from_tensor(biasTensor[i], CPU_GENERAL), bias, tensorNumBytes(biasDesc[i])); free(bias); } @@ -140,12 +141,12 @@ int rnnTest(int argc, char **argv, DataType dt, RNNMode mode) std::vector outputTensorRefVec(1, outputTensorRef); std::vector tmpTensorVec(1, tmpTensor); if (UT_CHECK) { - memset(get_ptr_from_tensor(tmpTensor, UT_CPU_ARCHINFO.arch), 0, tmpBytes); + UNI_MEMSET(get_ptr_from_tensor(tmpTensor, UT_CPU_ARCHINFO.arch), 0, tmpBytes); CHECK_STATUS(rnn(inputTensorVec, ftmTensor, biasTensor, rnnParamSpec, tmpTensorVec, outputTensorVec, &UT_CPU_ARCHINFO)); // naive implement - memset(get_ptr_from_tensor(tmpTensor, UT_CPU_ARCHINFO.arch), 0, tmpBytes); + UNI_MEMSET(get_ptr_from_tensor(tmpTensor, UT_CPU_ARCHINFO.arch), 0, tmpBytes); CHECK_STATUS(rnn(inputTensorVec, ftmTensorRef, biasTensor, rnnParamSpec, tmpTensorVec, outputTensorRefVec, &UT_SERIAL_ARCHINFO)); @@ -172,7 +173,7 @@ int rnnTest(int argc, char **argv, DataType dt, RNNMode mode) double hxDim = hDim + xDim; double ops = 1.0 * batch * step * (2.0 * hxDim * column * factor + column * factor + - rnnParamSpec.numProjection * rnnParamSpec.numOutput); + rnnParamSpec.num_projection * rnnParamSpec.num_outputs); ut_log(dt, buffer, ops, time); free(input); diff --git a/compute/tensor/tests/test_rnn_ocl.cpp b/compute/tensor/tests/test_rnn_ocl.cpp index 97df2e6f..51e60be1 100644 --- a/compute/tensor/tests/test_rnn_ocl.cpp +++ b/compute/tensor/tests/test_rnn_ocl.cpp @@ -16,12 +16,12 @@ int rnnTest(int argc, char **argv, DataType dt, RNNMode mode) { - U32 batch, step, xDim, hDim, numProjection, biDir; + U32 batch, step, xDim, hDim, num_projection, biDir; batch = atoi(argv[1]); step = atoi(argv[2]); xDim = atoi(argv[3]); hDim = atoi(argv[4]); - numProjection = atoi(argv[5]); + num_projection = atoi(argv[5]); biDir = atoi(argv[6]); ArchInfo archInfo; 
archInfo.arch = MALI; @@ -31,22 +31,22 @@ int rnnTest(int argc, char **argv, DataType dt, RNNMode mode) RNNParamSpec rnnParamSpec; rnnParamSpec.mode = RNN_LSTM; - rnnParamSpec.numOutput = hDim; - rnnParamSpec.numProjection = numProjection; - rnnParamSpec.forgetBias = 1.0; - rnnParamSpec.zoneoutCell = 0; - rnnParamSpec.zoneoutOutput = 0; + rnnParamSpec.num_outputs = hDim; + rnnParamSpec.num_projection = num_projection; + rnnParamSpec.forget_bias = 1.0; + rnnParamSpec.zoneout_cell = 0; + rnnParamSpec.zoneout_output = 0; rnnParamSpec.steps = 0; - rnnParamSpec.biDirection = (biDir) ? true : false; - rnnParamSpec.activationMode = ACTIVATION_TANH; + rnnParamSpec.bi_direction = (biDir) ? true : false; + rnnParamSpec.activation_type = ACTIVATION_TANH; - U32 col = (numProjection > 0) ? numProjection : hDim; + U32 col = (num_projection > 0) ? num_projection : hDim; TensorDesc inputDesc = tensor3df(dt, DF_NORMAL, batch, step, xDim); std::vector biasDesc(2); std::vector filterDesc(2); filterDesc[0] = tensor2df(dt, DF_NK, 4 * col, xDim + hDim); - filterDesc[1] = tensor2df(dt, DF_NK, hDim, numProjection); + filterDesc[1] = tensor2df(dt, DF_NK, hDim, num_projection); biasDesc[0] = tensor1d(dt, 4 * col); biasDesc[1] = tensor1d(dt, hDim); @@ -54,7 +54,7 @@ int rnnTest(int argc, char **argv, DataType dt, RNNMode mode) inputTensorCpu.resize(inputDesc); inputTensorCpu.alloc(); - U32 filterNum = (numProjection) ? 2 : 1; + U32 filterNum = (num_projection) ? 2 : 1; U32 biDirNum = (biDir) ? 2 : 1; std::vector filterTensorCpu(filterNum * biDirNum); std::vector biasTensorCpu(filterNum * biDirNum); @@ -70,7 +70,7 @@ int rnnTest(int argc, char **argv, DataType dt, RNNMode mode) Tensor outputTensorCpu; U32 inputLen = tensorNumElements(inputDesc); U8 *input_cpu = ut_input_v(inputLen, dt, UT_INIT_RANDOM); - memcpy(get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), input_cpu, inputLen * bytesOf(dt)); + UNI_MEMCPY(get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), input_cpu, inputLen * bytesOf(dt)); std::vector bias_cpu(filterNum * biDirNum); std::vector filter_cpu(filterNum * biDirNum); @@ -78,12 +78,12 @@ int rnnTest(int argc, char **argv, DataType dt, RNNMode mode) for (U32 j = 0; j < filterNum; j++) { U32 len = tensorNumElements(biasDesc[j]); bias_cpu[i * filterNum + j] = ut_input_v(len, dt, UT_INIT_RANDOM); - memcpy(get_ptr_from_tensor(biasTensorCpu[i * filterNum + j], CPU_GENERAL), + UNI_MEMCPY(get_ptr_from_tensor(biasTensorCpu[i * filterNum + j], CPU_GENERAL), bias_cpu[i * filterNum + j], len * bytesOf(dt)); len = tensorNumElements(filterDesc[j]); filter_cpu[i * filterNum + j] = ut_input_v(len, dt, UT_INIT_RANDOM); - memcpy(get_ptr_from_tensor(filterTensorCpu[i * filterNum + j], CPU_GENERAL), + UNI_MEMCPY(get_ptr_from_tensor(filterTensorCpu[i * filterNum + j], CPU_GENERAL), filter_cpu[i * filterNum + j], len * bytesOf(dt)); } } @@ -103,7 +103,7 @@ int rnnTest(int argc, char **argv, DataType dt, RNNMode mode) TensorDesc tmpDesc = tensor1d(DT_U8, tmpBytes); tmpTensorCpu.resize(tmpDesc); tmpTensorCpu.alloc(); - memset(get_ptr_from_tensor(tmpTensorCpu, CPU_GENERAL), 0, tmpBytes); + UNI_MEMSET(get_ptr_from_tensor(tmpTensorCpu, CPU_GENERAL), 0, tmpBytes); std::vector ftmBytes(4); CHECK_STATUS(rnn_transform_filter_bytes( @@ -259,7 +259,7 @@ int rnnTest(int argc, char **argv, DataType dt, RNNMode mode) #ifdef _DEBUG double hxDim = hDim + xDim; double ops = 1.0 * batch * step * - (2.0 * hxDim * col * 4 + col * 4 + rnnParamSpec.numProjection * rnnParamSpec.numOutput); + (2.0 * hxDim * col * 4 + col * 4 + 
rnnParamSpec.num_projection * rnnParamSpec.num_outputs); ut_log(dt, buffer, ops, time); #endif ut_check_a(output_gpu, get_ptr_from_tensor(outputTensorCpu, CPU_GENERAL), diff --git a/compute/tensor/tests/test_rnncell_ocl.cpp b/compute/tensor/tests/test_rnncell_ocl.cpp index ea952c69..16e1d17c 100644 --- a/compute/tensor/tests/test_rnncell_ocl.cpp +++ b/compute/tensor/tests/test_rnncell_ocl.cpp @@ -16,36 +16,36 @@ int rnncellTest(int argc, char **argv, DataType dt, RNNMode mode) { - U32 xDim, hDim, numProjection; + U32 xDim, hDim, num_projection; xDim = atoi(argv[1]); hDim = atoi(argv[2]); if (argc == 4) { - numProjection = atoi(argv[3]); + num_projection = atoi(argv[3]); } else { - numProjection = 0; + num_projection = 0; } ArchInfo archInfo; archInfo.arch = MALI; RNNParamSpec rnnParamSpec; rnnParamSpec.mode = RNN_LSTM; - rnnParamSpec.numOutput = hDim; - rnnParamSpec.numProjection = numProjection; - rnnParamSpec.forgetBias = 1.0; - rnnParamSpec.zoneoutCell = 0; - rnnParamSpec.zoneoutOutput = 0; + rnnParamSpec.num_outputs = hDim; + rnnParamSpec.num_projection = num_projection; + rnnParamSpec.forget_bias = 1.0; + rnnParamSpec.zoneout_cell = 0; + rnnParamSpec.zoneout_output = 0; rnnParamSpec.steps = -1; - rnnParamSpec.biDirection = false; - rnnParamSpec.activationMode = ACTIVATION_TANH; + rnnParamSpec.bi_direction = false; + rnnParamSpec.activation_type = ACTIVATION_TANH; - U32 col = (numProjection > 0) ? numProjection : hDim; + U32 col = (num_projection > 0) ? num_projection : hDim; TensorDesc inputDesc = tensor2df(dt, DF_NORMAL, 1, xDim); TensorDesc stateDesc = tensor2df(dt, DF_NORMAL, 1, col + hDim); std::vector biasDesc(2); std::vector filterDesc(2); filterDesc[0] = tensor2df(dt, DF_NK, 4 * col, xDim + hDim); - filterDesc[1] = tensor2df(dt, DF_NK, hDim, numProjection); + filterDesc[1] = tensor2df(dt, DF_NK, hDim, num_projection); biasDesc[0] = tensor1d(dt, 4 * col); biasDesc[1] = tensor1d(dt, hDim); @@ -69,10 +69,10 @@ int rnncellTest(int argc, char **argv, DataType dt, RNNMode mode) U32 inputLen = tensorNumElements(inputDesc); U32 stateLen = tensorNumElements(stateDesc); U8 *input_cpu = ut_input_v(inputLen, dt, UT_INIT_RANDOM); - memcpy(get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), input_cpu, inputLen * bytesOf(dt)); + UNI_MEMCPY(get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), input_cpu, inputLen * bytesOf(dt)); U8 *state_cpu = ut_input_v(stateLen, dt, UT_INIT_RANDOM); - memcpy(get_ptr_from_tensor(stateTensorCpu, CPU_GENERAL), state_cpu, stateLen * bytesOf(dt)); + UNI_MEMCPY(get_ptr_from_tensor(stateTensorCpu, CPU_GENERAL), state_cpu, stateLen * bytesOf(dt)); U8 *state_gpu_host = ut_input_v(stateLen, dt, UT_INIT_ZERO); std::vector bias_cpu(2); @@ -80,11 +80,12 @@ int rnncellTest(int argc, char **argv, DataType dt, RNNMode mode) for (U32 i = 0; i < 2; i++) { U32 len = tensorNumElements(biasDesc[i]); bias_cpu[i] = ut_input_v(len, dt, UT_INIT_RANDOM); - memcpy(get_ptr_from_tensor(biasTensorCpu[i], CPU_GENERAL), bias_cpu[i], len * bytesOf(dt)); + UNI_MEMCPY( + get_ptr_from_tensor(biasTensorCpu[i], CPU_GENERAL), bias_cpu[i], len * bytesOf(dt)); len = tensorNumElements(filterDesc[i]); filter_cpu[i] = ut_input_v(len, dt, UT_INIT_RANDOM); - memcpy( + UNI_MEMCPY( get_ptr_from_tensor(filterTensorCpu[i], CPU_GENERAL), filter_cpu[i], len * bytesOf(dt)); } @@ -102,7 +103,7 @@ int rnncellTest(int argc, char **argv, DataType dt, RNNMode mode) TensorDesc tmpDesc = tensor1d(DT_U8, tmpBytes); tmpTensorCpu.resize(tmpDesc); tmpTensorCpu.alloc(); - memset(get_ptr_from_tensor(tmpTensorCpu, CPU_GENERAL), 
0, tmpBytes); + UNI_MEMSET(get_ptr_from_tensor(tmpTensorCpu, CPU_GENERAL), 0, tmpBytes); std::vector ftmBytes(2); CHECK_STATUS(rnn_transform_filter_bytes( @@ -235,7 +236,7 @@ int rnncellTest(int argc, char **argv, DataType dt, RNNMode mode) #ifdef _DEBUG double hxDim = hDim + xDim; double ops = 1.0 * - (2.0 * hxDim * col * 4 + col * 4 + rnnParamSpec.numProjection * rnnParamSpec.numOutput); + (2.0 * hxDim * col * 4 + col * 4 + rnnParamSpec.num_projection * rnnParamSpec.num_outputs); ut_log(dt, buffer, ops, time); #endif ut_check_a(output_gpu, get_ptr_from_tensor(outputTensorCpu, CPU_GENERAL), diff --git a/compute/tensor/tests/test_roialign.cpp b/compute/tensor/tests/test_roialign.cpp index 6ff8ccde..c1e7633d 100644 --- a/compute/tensor/tests/test_roialign.cpp +++ b/compute/tensor/tests/test_roialign.cpp @@ -39,6 +39,7 @@ int roialignTest(int argc, char **argv, DataType dt) F32 spatial_scale = (F32)atof(argv[15]); RoIAlignParamSpec p; + p.mode = POOLING_MEAN; p.output_h = output_h; p.output_w = output_w; p.sampling_ratio = sampling_ratio; @@ -58,11 +59,11 @@ int roialignTest(int argc, char **argv, DataType dt) U8 *input_feat = ut_input_v(input_len_feat, dt, UT_INIT_RANDOM); U8 *input_rois = ut_input_v(input_len_rois, dt, UT_INIT_RANDOM); U8 *input_batch = ut_input_v(input_len_batch, dt, UT_INIT_ZERO); - memcpy(get_ptr_from_tensor(inputTensor_feat, CPU_GENERAL), input_feat, + UNI_MEMCPY(get_ptr_from_tensor(inputTensor_feat, CPU_GENERAL), input_feat, tensorNumBytes(inputDesc_feat)); - memcpy(get_ptr_from_tensor(inputTensor_rois, CPU_GENERAL), input_rois, + UNI_MEMCPY(get_ptr_from_tensor(inputTensor_rois, CPU_GENERAL), input_rois, tensorNumBytes(inputDesc_rois)); - memcpy(get_ptr_from_tensor(inputTensor_batch, CPU_GENERAL), input_batch, + UNI_MEMCPY(get_ptr_from_tensor(inputTensor_batch, CPU_GENERAL), input_batch, tensorNumBytes(inputDesc_batch)); inputTensors[0] = inputTensor_feat; inputTensors[1] = inputTensor_rois; diff --git a/compute/tensor/tests/test_roialign_ocl.cpp b/compute/tensor/tests/test_roialign_ocl.cpp index 8d3882a4..297b315b 100644 --- a/compute/tensor/tests/test_roialign_ocl.cpp +++ b/compute/tensor/tests/test_roialign_ocl.cpp @@ -58,7 +58,7 @@ int roialignTest(int argc, char *argv[], DataType dt) } RoIAlignParamSpec p; - p.coordinateTransformationMode = ROIALIGN_HALF_PIXEL; + p.trans_mode = COORDINATE_TRANS_HALF_PIXEL; p.mode = POOLING_MEAN; p.output_w = ow; p.output_h = oh; @@ -88,7 +88,7 @@ int roialignTest(int argc, char *argv[], DataType dt) inTensorsCpu[i].resize(roiDesc); } inTensorsCpu[i].alloc(); - memcpy(get_ptr_from_tensor(inTensorsCpu[i], CPU_GENERAL), inputCpu[i], + UNI_MEMCPY(get_ptr_from_tensor(inTensorsCpu[i], CPU_GENERAL), inputCpu[i], tensorNumBytes(inTensorsCpu[i].get_desc())); inTensorPtrCpu[i] = &inTensorsCpu[i]; } diff --git a/compute/tensor/tests/test_scale.cpp b/compute/tensor/tests/test_scale.cpp index 9bf5a068..82a92609 100644 --- a/compute/tensor/tests/test_scale.cpp +++ b/compute/tensor/tests/test_scale.cpp @@ -35,8 +35,8 @@ int scaleTest(int argc, char **argv, DataType dt) dataTensorRef.resize(inDesc); dataTensor.alloc(); dataTensorRef.alloc(); - memcpy(get_ptr_from_tensor(dataTensor, CPU_GENERAL), data, tensorNumBytes(inDesc)); - memcpy(get_ptr_from_tensor(dataTensorRef, CPU_GENERAL), data, tensorNumBytes(inDesc)); + UNI_MEMCPY(get_ptr_from_tensor(dataTensor, CPU_GENERAL), data, tensorNumBytes(inDesc)); + UNI_MEMCPY(get_ptr_from_tensor(dataTensorRef, CPU_GENERAL), data, tensorNumBytes(inDesc)); U8 *alpha = ut_input_v(ic, dt, UT_INIT_RANDOM); U8 
*beta = ut_input_v(ic, dt, UT_INIT_RANDOM); diff --git a/compute/tensor/tests/test_scale_ocl.cpp b/compute/tensor/tests/test_scale_ocl.cpp index e9c4fa07..7f9e56ae 100644 --- a/compute/tensor/tests/test_scale_ocl.cpp +++ b/compute/tensor/tests/test_scale_ocl.cpp @@ -65,7 +65,8 @@ int scaleTest(int argc, char **argv, DataType dt) Tensor inputTensorCpu, outputTensorCpu; inputTensorCpu.resize(inputDesc); inputTensorCpu.alloc(); - memcpy(get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), inputCpu, tensorNumBytes(inputDesc)); + UNI_MEMCPY( + get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), inputCpu, tensorNumBytes(inputDesc)); CHECK_STATUS( scale_infer_output_size(&inputTensorCpu, p, axisLen, &outputTensorCpu, &UT_SERIAL_ARCHINFO)); outputTensorCpu.alloc(); diff --git a/compute/tensor/tests/test_slice.cpp b/compute/tensor/tests/test_slice.cpp index a0f1142b..3d3b9e11 100644 --- a/compute/tensor/tests/test_slice.cpp +++ b/compute/tensor/tests/test_slice.cpp @@ -27,8 +27,8 @@ int sliceTest(int argc, char **argv, DataType dt) U32 iw = atoi(argv[5]); SliceParamSpec p; p.axis = atoi(argv[6]); - p.slice_size = num - 1; - for (U32 i = 0; i < p.slice_size; i++) { + p.num_slice = num - 1; + for (U32 i = 0; i < p.num_slice; i++) { p.slice_points[i] = atoi(argv[7 + i]); } @@ -39,7 +39,7 @@ int sliceTest(int argc, char **argv, DataType dt) Tensor inputTensor; inputTensor.resize(inDesc); inputTensor.alloc(); - memcpy(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inDesc)); + UNI_MEMCPY(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inDesc)); std::vector outputTensors(num); std::vector outputTensorsPtr(num); diff --git a/compute/tensor/tests/test_slice_ocl.cpp b/compute/tensor/tests/test_slice_ocl.cpp index b0ec7672..9870f343 100644 --- a/compute/tensor/tests/test_slice_ocl.cpp +++ b/compute/tensor/tests/test_slice_ocl.cpp @@ -27,8 +27,8 @@ int sliceTest(int argc, char **argv, DataType dt) U32 iw = atoi(argv[5]); SliceParamSpec p; p.axis = atoi(argv[6]); - p.slice_size = num - 1; - for (U32 i = 0; i < p.slice_size; i++) { + p.num_slice = num - 1; + for (U32 i = 0; i < p.num_slice; i++) { p.slice_points[i] = atoi(argv[7 + i]); } ArchInfo archInfo; @@ -45,7 +45,7 @@ int sliceTest(int argc, char **argv, DataType dt) Tensor inputTensorCpu; inputTensorCpu.resize(inDesc); inputTensorCpu.alloc(); - memcpy(get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), inputCpu, tensorNumBytes(inDesc)); + UNI_MEMCPY(get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), inputCpu, tensorNumBytes(inDesc)); std::vector outputTensorsCpu(num); std::vector outputTensorsPtrCpu(num); for (I32 i = 0; i < num; i++) { diff --git a/compute/tensor/tests/test_softmax.cpp b/compute/tensor/tests/test_softmax.cpp index 4b3a9361..210932eb 100644 --- a/compute/tensor/tests/test_softmax.cpp +++ b/compute/tensor/tests/test_softmax.cpp @@ -24,7 +24,7 @@ int softmaxTest(int argc, char **argv, DataType dt) TensorDesc inDesc = tensor2df(dt, DF_NORMAL, 1, len); U8 *input = ut_input_v(len, dt, UT_INIT_RANDOM); Tensor inputTensor = Tensor::alloc_sized(inDesc); - memcpy(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inDesc)); + UNI_MEMCPY(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inDesc)); Tensor outputTensor; CHECK_STATUS(softmax_infer_output_size(&inputTensor, p, &outputTensor, &UT_CPU_ARCHINFO)); diff --git a/compute/tensor/tests/test_softmax_ocl.cpp b/compute/tensor/tests/test_softmax_ocl.cpp index f640f1bd..b1c01d90 100644 --- 
a/compute/tensor/tests/test_softmax_ocl.cpp +++ b/compute/tensor/tests/test_softmax_ocl.cpp @@ -108,7 +108,7 @@ int softmaxTest(int argc, char **argv, DataType dt) Tensor inputTensorCpu; inputTensorCpu.resize(in_desc); inputTensorCpu.alloc(); - memcpy(get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), input_cpu, tensorNumBytes(in_desc)); + UNI_MEMCPY(get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), input_cpu, tensorNumBytes(in_desc)); Tensor outputTensorCpu; outputTensorCpu.resize(out_desc); diff --git a/compute/tensor/tests/test_split.cpp b/compute/tensor/tests/test_split.cpp index 03699ca6..8102da9b 100644 --- a/compute/tensor/tests/test_split.cpp +++ b/compute/tensor/tests/test_split.cpp @@ -32,7 +32,7 @@ int splitTest(int argc, char **argv, DataType dt) Tensor inputTensor; inputTensor.resize(inDesc); inputTensor.alloc(); - memcpy(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inDesc)); + UNI_MEMCPY(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inDesc)); std::vector outputTensors(num); std::vector outputTensorsPtr(num); diff --git a/compute/tensor/tests/test_tfslice_ocl.cpp b/compute/tensor/tests/test_tfslice_ocl.cpp index 5ac0eec4..0afd6386 100644 --- a/compute/tensor/tests/test_tfslice_ocl.cpp +++ b/compute/tensor/tests/test_tfslice_ocl.cpp @@ -49,7 +49,8 @@ int tfsliceTest(int argc, char **argv, DataType dt) Tensor inputTensorCpu; inputTensorCpu.resize(inputDesc); inputTensorCpu.alloc(); - memcpy(get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), input_cpu, tensorNumBytes(inputDesc)); + UNI_MEMCPY( + get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), input_cpu, tensorNumBytes(inputDesc)); Tensor outputTensorCpu; Tensor tmpTensorCpu; @@ -110,7 +111,7 @@ int tfsliceTest(int argc, char **argv, DataType dt) char buffer[150]; char params[120]; - memset(params, 0, 120); + UNI_MEMSET(params, 0, 120); sprintf(params, "(%u %u %u %u)=(%u %u %u %u)", in, ic, ih, iw, on, oc, oh, ow); sprintf(buffer, "%20s, %80s", "tfslice", params); #ifdef _DEBUG diff --git a/compute/tensor/tests/test_tile.cpp b/compute/tensor/tests/test_tile.cpp index 76ad94d9..45a9bade 100644 --- a/compute/tensor/tests/test_tile.cpp +++ b/compute/tensor/tests/test_tile.cpp @@ -23,8 +23,8 @@ int tileTest(int argc, char **argv, DataType dt) //input axis and tiles TileParamSpec tileParamSpec; tileParamSpec.axis = atoi(argv[5]); - tileParamSpec.dimsSize = 1; - tileParamSpec.repeatsInfo[0] = atoi(argv[6]); + tileParamSpec.num_repeats = 1; + tileParamSpec.repeats[0] = atoi(argv[6]); //set input DataFormat df = DF_NCHW; @@ -32,7 +32,7 @@ int tileTest(int argc, char **argv, DataType dt) U32 len = tensorNumElements(inDesc); U8 *input = ut_input_v(len, dt, UT_INIT_RANDOM); Tensor inputTensor = Tensor::alloc_sized(inDesc); - memcpy(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, inputTensor.bytes()); + UNI_MEMCPY(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, inputTensor.bytes()); //set output Tensor outputTensor; @@ -43,7 +43,7 @@ int tileTest(int argc, char **argv, DataType dt) Tensor tmpTensor; CHECK_STATUS(tile(inputTensor, tileParamSpec, tmpTensor, outputTensor, &UT_CPU_ARCHINFO)); - CHECK_REQUIREMENT(outputTensor.length() == (len * tileParamSpec.repeatsInfo[0])); + CHECK_REQUIREMENT(outputTensor.length() == (len * tileParamSpec.repeats[0])); } return 0; diff --git a/compute/tensor/tests/test_tile_ocl.cpp b/compute/tensor/tests/test_tile_ocl.cpp index 4ebbb5b2..d19c0d42 100644 --- a/compute/tensor/tests/test_tile_ocl.cpp +++ b/compute/tensor/tests/test_tile_ocl.cpp @@ -22,12 
+22,12 @@ int tileTest(int argc, char **argv, DataType dt) } U32 iDim[8]; TileParamSpec tileParamSpec; - tileParamSpec.dimsSize = nDims; + tileParamSpec.num_repeats = nDims; for (U32 i = 2; i < nDims + 2; i++) { iDim[i - 2] = atoi(argv[i]); } for (U32 i = nDims + 2; i < 2 * nDims + 2; i++) { - tileParamSpec.repeatsInfo[i - nDims - 2] = atoi(argv[i]); + tileParamSpec.repeats[i - nDims - 2] = atoi(argv[i]); } ArchInfo archInfo; @@ -58,7 +58,8 @@ int tileTest(int argc, char **argv, DataType dt) Tensor inputTensorCpu, outputTensorCpu, tmpTensorCpu; inputTensorCpu.resize(inputDesc); inputTensorCpu.alloc(); - memcpy(get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), input_cpu, tensorNumBytes(inputDesc)); + UNI_MEMCPY( + get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), input_cpu, tensorNumBytes(inputDesc)); CHECK_STATUS(tile_infer_output_size( &inputTensorCpu, tileParamSpec, &outputTensorCpu, &UT_SERIAL_ARCHINFO)); @@ -122,7 +123,7 @@ int tileTest(int argc, char **argv, DataType dt) char buffer[150]; char params[120]; - memset(params, 0, 120); + UNI_MEMSET(params, 0, 120); sprintf(params, "("); for (U32 i = 0; i < inputDesc.nDims; i++) { if (i != inputDesc.nDims - 1) { diff --git a/compute/tensor/tests/test_topk_ocl.cpp b/compute/tensor/tests/test_topk_ocl.cpp index 66352a03..251f7567 100644 --- a/compute/tensor/tests/test_topk_ocl.cpp +++ b/compute/tensor/tests/test_topk_ocl.cpp @@ -18,14 +18,14 @@ inline void topk_cpu_max(F16 *input, U32 len, U32 topk, F16 *output, I32 *output { for (U32 i = 0; i < topk; i++) { U32 index = 0; - F16 max_val = -65536; + F16 max_val = -UNI_F16_MAX; for (U32 j = 0; j < len; j++) { if (input[j] > max_val) { max_val = input[j]; index = j; } } - input[index] = -65536; + input[index] = -UNI_F16_MAX; output[i] = max_val; outputId[i] = index; } @@ -36,8 +36,8 @@ inline void sort_gpu_result( { std::vector skip_j; for (U32 i = 0; i < topk; i++) { - F16 max_val = -65536; - I32 index = 65536; + F16 max_val = -UNI_F16_MAX; + I32 index = UNI_F16_MAX; U32 sj = 0; for (U32 j = 0; j < topk; j++) { bool skip = false; @@ -73,7 +73,7 @@ int topkTest(int argc, char **argv, DataType dt) U32 iw = 3000; TopKParamSpec p; p.axis = 0; - p.topk = 30; + p.k = 30; p.largest = 1; p.sorted = 0; if (argc == 8) { @@ -82,7 +82,7 @@ int topkTest(int argc, char **argv, DataType dt) ih = atoi(argv[3]); iw = atoi(argv[4]); p.axis = atof(argv[5]); - p.topk = atof(argv[6]); + p.k = atof(argv[6]); p.largest = atof(argv[7]); p.sorted = atof(argv[8]); } @@ -94,8 +94,8 @@ int topkTest(int argc, char **argv, DataType dt) U32 len = in * ic * ih * iw; TensorDesc input_desc_cpu = tensor1d(dt, len); - TensorDesc output_desc_cpu = tensor1d(dt, (U32)p.topk); - TensorDesc output_indices_desc_cpu = tensor1d(DT_I32, (U32)p.topk); + TensorDesc output_desc_cpu = tensor1d(dt, (U32)p.k); + TensorDesc output_indices_desc_cpu = tensor1d(DT_I32, (U32)p.k); TensorDesc input_desc_gpu = tensor1d(dt, len); TensorDesc output_desc_gpu, output_indices_desc_gpu; @@ -166,16 +166,16 @@ int topkTest(int argc, char **argv, DataType dt) sprintf(params, "(%u %u %u %u) = (%u %u %u %u)", in, ic, ih, iw, on, oc, oh, ow); sprintf(buffer, "16bit%20s, %80s", "topk", params); - F16 *output_cpu = (F16 *)malloc(sizeof(F16) * p.topk); - I32 *output_id_cpu = (I32 *)malloc(sizeof(I32) * p.topk); - F16 *res_gpu_sort = (F16 *)malloc(sizeof(F16) * p.topk); - I32 *res_id_gpu_sort = (I32 *)malloc(sizeof(I32) * p.topk); - topk_cpu_max((F16 *)input_cpu, len, p.topk, output_cpu, output_id_cpu); + F16 *output_cpu = (F16 *)malloc(sizeof(F16) * p.k); + I32 
*output_id_cpu = (I32 *)malloc(sizeof(I32) * p.k); + F16 *res_gpu_sort = (F16 *)malloc(sizeof(F16) * p.k); + I32 *res_id_gpu_sort = (I32 *)malloc(sizeof(I32) * p.k); + topk_cpu_max((F16 *)input_cpu, len, p.k, output_cpu, output_id_cpu); sort_gpu_result( - (F16 *)output_gpu, (I32 *)output_indices_gpu, p.topk, res_gpu_sort, res_id_gpu_sort); + (F16 *)output_gpu, (I32 *)output_indices_gpu, p.k, res_gpu_sort, res_id_gpu_sort); - ut_check_a(res_gpu_sort, output_cpu, p.topk, dt); - ut_check_a(res_id_gpu_sort, output_id_cpu, p.topk, dt); + ut_check_a(res_gpu_sort, output_cpu, p.k, dt); + ut_check_a(res_id_gpu_sort, output_id_cpu, p.k, dt); CHECK_STATUS(gcl_finish(handle)); CHECK_STATUS(gcl_clean_kernelVec(handle)); diff --git a/compute/tensor/tests/test_transpose.cpp b/compute/tensor/tests/test_transpose.cpp index 0d707b90..2df702ec 100644 --- a/compute/tensor/tests/test_transpose.cpp +++ b/compute/tensor/tests/test_transpose.cpp @@ -24,12 +24,12 @@ int transposeTest(int argc, char **argv, DataType dt) U32 ih = atoi(argv[3]); U32 iw = atoi(argv[4]); TransposeParamSpec p, p_inv; - p.trans_size = 4; - p_inv.trans_size = 4; - for (int i = 0; i < 4; i++) { + p.num_axes = 4; + p_inv.num_axes = 4; + for (U32 i = 0; i < p_inv.num_axes; i++) { I32 value = atoi(argv[5 + i]); - p.trans_dims[i] = value; - p_inv.trans_dims[value] = i; + p.axes[i] = value; + p_inv.axes[value] = i; } DataFormat df = DF_NCHW; @@ -39,7 +39,7 @@ int transposeTest(int argc, char **argv, DataType dt) Tensor inputTensor; inputTensor.resize(inDesc); inputTensor.alloc(); - memcpy(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inDesc)); + UNI_MEMCPY(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inDesc)); Tensor outputTensor1; Tensor outputTensor2; diff --git a/compute/tensor/tests/test_transpose_ocl.cpp b/compute/tensor/tests/test_transpose_ocl.cpp index 0386d451..16585391 100644 --- a/compute/tensor/tests/test_transpose_ocl.cpp +++ b/compute/tensor/tests/test_transpose_ocl.cpp @@ -31,13 +31,13 @@ int transposeTest(int argc, char **argv, DataType dt) inputDesc_cpu.df = DF_NCHW; } CHECK_REQUIREMENT(argc == (int)(nDims * 2 + 2)); - p.trans_size = nDims; - p_inv.trans_size = nDims; + p.num_axes = nDims; + p_inv.num_axes = nDims; for (U32 i = 0; i < nDims; i++) { inputDesc_cpu.dims[nDims - 1 - i] = atoi(argv[2 + i]); I32 value = atoi(argv[2 + nDims + i]); - p.trans_dims[i] = value; - p_inv.trans_dims[value] = i; + p.axes[i] = value; + p_inv.axes[value] = i; } inputDesc_gpu = inputDesc_cpu; @@ -52,7 +52,7 @@ int transposeTest(int argc, char **argv, DataType dt) Tensor inputTensorCpu; inputTensorCpu.resize(inputDesc_cpu); inputTensorCpu.alloc(); - memcpy( + UNI_MEMCPY( get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), input_cpu, tensorNumBytes(inputDesc_cpu)); Tensor outputTensorCpu; Tensor tmpTensorCpu; diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index 5f8612e2..ff0020ea 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -20,7 +20,9 @@ - [engine](../inference/engine) hosts the inference engine of neural networks. - [flow](../inference/flow) hosts the multi-backends(CPU+GPU) heterogeneous device schedule for time series data. - [examples](../inference/examples) gives some application examples (Network Benchmark, ImageNet classification). -- [kit](../kit) +- [kit](../kit) - kit provides some application demos. +- [Training](../training) + - training provides all on-device training modules and examples. 
For API, Flow and operator development, please refer to [DEVELOPER.md](DEVELOPER.md). diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 05400769..73ac5ac2 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -7,16 +7,54 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](). +### [1.3.0] - 2022-2-28 + +#### Added + +- Support on-device training for MLP, CNN(lenet, resnet50, mobilenetv1), transformer/bert(text to speech) +- Support changing model input and output names in X2bolt +- Support more graph optimizations : Transpose+Convolution, Swish, Quantization, Power+Scale +- Support dynamic output related operators : Shape, ConstantOfShape, GenerateProposals, NonZero, NonMaxSuppression, Reshape, etc +- Support more operators : GridSample, CumSum, OneHot, Round, Floor, Ceil +- Support more networks on CPU : yolov2, yolov3, yolov4, yolov5, faster-rcnn, mask-rcnn, retinanet, dfsmn, frill, conformer, unet, etc +- Support Armv8 int8 to accelerate NLP network +- Improve inference performance on avx2 CPU +- Support netron to visualize bolt models +- Support not binding CPU cores +- Add C API MemoryCheck to check for bolt memory leaks + +#### Changed + +- X2bolt adds -I and -O options to change model input and output names. +- X2bolt adds -t option to convert models for on-device training. +- C API CreateModel and AllocAllResultHandle return NULL when unsuccessful. +- install.sh adds --neon option to disable arm neon acceleration on old platforms. +- Some operator parameter definitions + +#### Fixed + +- Fix GPU depth2space and deconv bug +- Fix GPU preprocess tool on armv8 platform bug +- Fix x86 Sigmoid precision +- Fix C API CloneResultHandle bug +- Fix mobilenetv1 int8 inference +- Fix Java API build bug on Windows +- Fix ONNX converter deconv, pooling parameter bug + +#### Removed + +- Equal operator is replaced with Check. + ### [1.2.1] - 2021-9-11 #### Added - Support more graph optimizations : Convolution+Convolution, LayerNorm -- Support more operators: ROIAlign, GenerateProposals, Reciprocal, Not, Log, ReductionL2, InstanceNorm, Expand, Gather, Scatter +- Support more operators : ROIAlign, GenerateProposals, Reciprocal, Not, Log, ReductionL2, InstanceNorm, Expand, Gather, Scatter - Support more operators(PReLU) process NCHW input data. - Support ONNX share weight between Linear, MatMul, Gemm and Gather -- Support more networks on CPU: vision transformer(ViT, TNT), recommendation networks +- Support more networks on CPU : vision transformer(ViT, TNT), recommendation networks - Support more networks on GPU : ASR, Faster_RCNN - Support Armv7 int8 to accelerate NLP network(50%+ speed-up) - Support X86 AVX512 int8 to accelerate NLP network(3x+ speed-up) diff --git a/docs/DEVELOPER.md b/docs/DEVELOPER.md index 8e69f718..698462cc 100644 --- a/docs/DEVELOPER.md +++ b/docs/DEVELOPER.md @@ -24,7 +24,7 @@ you can customize the unsupported operators step by step which has been describe ### C API -Bolt provides C API document generated by doxygen to help you use [C API](../inference/engine/api/c/bolt.h), [image classification example](../inference/examples/c_api/c_image_classifification.c) and [Chinese input method example](../inference/examples/c_api/c_input_method.c).
+Bolt provides C API document generated by doxygen to help you use [C API](../inference/engine/api/c/bolt.h), [image classification example](../inference/examples/c_api/c_image_classification.c) and [Chinese input method example](../inference/examples/c_api/c_input_method.c). You can compile it and link *libbolt.so* library with your C/C++ project. ### Java API @@ -120,13 +120,13 @@ In [model_tools](../model_tools), you can define any operator for model conversi unsigned int stride_t; unsigned int stride_h; unsigned int stride_w; - unsigned int padding_before; - unsigned int padding_after; - unsigned int padding_top; - unsigned int padding_bottom; - unsigned int padding_left; - unsigned int padding_right; - RoundMode rm; + unsigned int pad_before; + unsigned int pad_after; + unsigned int pad_top; + unsigned int pad_bottom; + unsigned int pad_left; + unsigned int pad_right; + RoundMode round_mode; PoolingMode mode; } PoolingParamSpec; // <====== Addition @@ -149,13 +149,11 @@ In [model_tools](../model_tools), you can define any operator for model conversi ```c++ OperatorType convert_caffe_type(std::string inputType) { - // Addition ======> - if (inputType == "Pooling") { - return OT_Pooling; - } // <====== Addition - else if (inputType == "Convolution") { - ... - } + std::map operatorMap = { + // Addition ======> + {"Pooling", OT_Pooling}, + // <====== Addition + }; } ``` @@ -163,13 +161,11 @@ In [model_tools](../model_tools), you can define any operator for model conversi ```c++ virtual EE adapt_operator(OperatorType type, ParameterSpec *ps) { - ... - // Addition ======> - else if (type == OT_Pooling) { - *ps = adapt_Pooling(); - } - // <====== Addition - ... + std::map functions = { + // Addition ======> + {OT_Pooling, &ModelAdaptee::adapt_Pooling}, + // <====== Addition + }; } // Addition ======> @@ -183,62 +179,62 @@ In [model_tools](../model_tools), you can define any operator for model conversi // Addition ======> ParameterSpec adapt_Pooling() override { - ParameterSpec curPs; - memset(&curPs, 0, sizeof(curPs)); - PoolingParamSpec pps; - memset(&pps, 0, sizeof(pps)); - pps.kernel_t = 1; - pps.stride_t = 1; - pps.padding_before = 0; - pps.padding_after = 0; - if (layer.pooling_param().has_kernel_w() && layer.pooling_param().has_kernel_h()) { - pps.kernel_w = layer.pooling_param().kernel_w(); - pps.kernel_h = layer.pooling_param().kernel_h(); + ParameterSpec ps; + PoolingParamSpec p; + memset(&p, 0, sizeof(p)); + p.kernel_t = 1; + p.stride_t = 1; + p.pad_before = 0; + p.pad_after = 0; + auto cp = layer.pooling_param(); + if (cp.has_kernel_w() && cp.has_kernel_h()) { + p.kernel_w = cp.kernel_w(); + p.kernel_h = cp.kernel_h(); } else { - pps.kernel_h = layer.pooling_param().kernel_size(); - pps.kernel_w = pps.kernel_h; + p.kernel_h = cp.kernel_size(); + p.kernel_w = p.kernel_h; } - if (layer.pooling_param().has_stride_w() && layer.pooling_param().has_stride_h()) { - pps.stride_w = layer.pooling_param().stride_w(); - pps.stride_h = layer.pooling_param().stride_h(); + if (cp.has_stride_w() && cp.has_stride_h()) { + p.stride_w = cp.stride_w(); + p.stride_h = cp.stride_h(); } else { - pps.stride_h = layer.pooling_param().stride(); - pps.stride_w = pps.stride_h; + p.stride_h = cp.stride(); + p.stride_w = p.stride_h; } - bool global_pooling = layer.pooling_param().global_pooling(); + bool global_pooling = cp.global_pooling(); if (global_pooling) { - pps.kernel_h = 0; - pps.kernel_w = 0; - pps.stride_h = 1; - pps.stride_w = 1; + p.kernel_h = 0; + p.kernel_w = 0; + p.stride_h = 1; + p.stride_w = 
1; } else { - CHECK_REQUIREMENT(pps.kernel_h > 0); + CHECK_REQUIREMENT(p.kernel_h > 0); } - if (layer.pooling_param().has_pad_w() && layer.pooling_param().has_pad_h()) { - pps.padding_left = layer.pooling_param().pad_w(); - pps.padding_right = pps.padding_left; - pps.padding_top = layer.pooling_param().pad_h(); - pps.padding_bottom = pps.padding_top; + if (cp.has_pad_w() && cp.has_pad_h()) { + p.pad_left = cp.pad_w(); + p.pad_right = p.pad_left; + p.pad_top = cp.pad_h(); + p.pad_bottom = p.pad_top; } else { - pps.padding_top = layer.pooling_param().has_pad() ? layer.pooling_param().pad() : 0; - pps.padding_bottom = pps.padding_top; - pps.padding_left = pps.padding_top; - pps.padding_right = pps.padding_top; + p.pad_top = cp.has_pad() ? cp.pad() : 0; + p.pad_bottom = p.pad_top; + p.pad_left = p.pad_top; + p.pad_right = p.pad_top; } - if (layer.pooling_param().has_round_mode() && layer.pooling_param().round_mode() == 1) { - pps.rm = FLOOR; + if (cp.has_round_mode() && cp.round_mode() == 1) { + p.round_mode = ROUND_FLOOR; } else { - pps.rm = CEIL; + p.round_mode = ROUND_CEIL; } - auto op = layer.pooling_param().pool(); + auto op = cp.pool(); switch (op) { case caffe::PoolingParameter_PoolMethod_MAX: { - pps.mode = POOLING_MAX; + p.mode = POOLING_MAX; break; } case caffe::PoolingParameter_PoolMethod_AVE: { - pps.mode = POOLING_MEAN; + p.mode = POOLING_MEAN; break; } default: { @@ -248,9 +244,9 @@ In [model_tools](../model_tools), you can define any operator for model conversi this->layer.name().c_str(), descriptor->FindValueByNumber(op)->name().c_str()); } } - curPs.pooling_spec = pps; - return curPs; - } + ps.pooling_spec = p; + return ps; + } // <====== Addition ``` @@ -274,13 +270,13 @@ In [model_tools](../model_tools), you can define any operator for model conversi ```c++ OperatorType convert_onnx_type(std::string inputType) { - // Addition ======> - if (inputType == "AveragePool" || inputType == "MaxPool" || inputType == "GlobalAveragePool") { - return OT_Pooling; - } // <====== Addition - else if (inputType == "Conv") { - ... - } + std::map operatorMap = { + // Addition ======> + {"AveragePool", OT_Pooling}, + {"MaxPool", OT_Pooling}, + {"GlobalAveragePool", OT_Pooling}, + // <====== Addition + }; } ``` @@ -288,13 +284,11 @@ In [model_tools](../model_tools), you can define any operator for model conversi ```c++ virtual EE adapt_operator(OperatorType type, ParameterSpec *ps) { - ... - // Addition ======> - else if (type == OT_Pooling) { - *ps = adapt_Pooling(); - } - // <====== Addition - ... 
+ std::map functions = { + // Addition ======> + {OT_Pooling, &ModelAdaptee::adapt_Pooling}, + // <====== Addition + }; } // Addition ======> @@ -308,82 +302,89 @@ In [model_tools](../model_tools), you can define any operator for model conversi // Addition ======> ParameterSpec adapt_Pooling() override { - ParameterSpec curPs; - memset(&curPs, 0, sizeof(curPs)); - PoolingParamSpec pps; - memset(&pps, 0, sizeof(pps)); - std::string autoPad = get_node_str_attribute_by_name(node, "auto_pad"); // deprecated - std::vector kernelShape = get_node_vector_ints_attribute_by_name(node, "kernel_shape"); - std::vector strides = get_node_vector_ints_attribute_by_name(node, "strides"); - std::vector pads = get_node_vector_ints_attribute_by_name(node, "pads"); - - if (op == "AveragePool" || op == "ReduceMean" || op == "GlobalAveragePool") { - pps.mode = POOLING_MEAN; + ParameterSpec ps; + PoolingParamSpec p; + memset(&p, 0, sizeof(p)); + std::string autoPad = get_string(this->onnxNode, "auto_pad"); + std::vector kernels = get_ints(this->onnxNode, "kernel_shape"); + std::vector strides = get_ints(this->onnxNode, "strides"); + std::vector pads = get_ints(this->onnxNode, "pads"); + int ceil_mode = get_int(this->onnxNode, "ceil_mode", 0); + + const std::string &onnxNodeType = this->onnxNode.op_type(); + if (onnxNodeType == "AveragePool" || onnxNodeType == "ReduceMean" || + onnxNodeType == "GlobalAveragePool") { + p.mode = POOLING_MEAN; } else { - pps.mode = POOLING_MAX; + p.mode = POOLING_MAX; } - if (autoPad == "SAME_UPPER") { - pps.rm = CEIL; + if (ceil_mode) { + p.round_mode = ROUND_CEIL; } else { - pps.rm = FLOOR; + p.round_mode = ROUND_FLOOR; } - pps.kernel_t = 0; - pps.kernel_h = 0; - pps.kernel_w = 0; - if (kernelShape.size() == 3) { - pps.kernel_t = kernelShape[0]; - pps.kernel_h = kernelShape[1]; - pps.kernel_w = kernelShape[2]; - } else if (kernelShape.size() == 2) { - pps.kernel_t = 1; - pps.kernel_h = kernelShape[0]; - pps.kernel_w = kernelShape[1]; - } else if (kernelShape.size() == 1) { - pps.kernel_t = 1; - pps.kernel_h = kernelShape[0]; - pps.kernel_w = 1; + p.kernel_t = 0; + p.kernel_h = 0; + p.kernel_w = 0; + if (kernels.size() == 3) { + p.kernel_t = kernels[0]; + p.kernel_h = kernels[1]; + p.kernel_w = kernels[2]; + } else if (kernels.size() == 2) { + p.kernel_t = 1; + p.kernel_h = kernels[0]; + p.kernel_w = kernels[1]; + } else if (kernels.size() == 1) { + p.kernel_t = 1; + p.kernel_h = kernels[0]; + p.kernel_w = 1; } - pps.stride_t = 1; - pps.stride_h = 1; - pps.stride_w = 1; + p.stride_t = 1; + p.stride_h = 1; + p.stride_w = 1; if (strides.size() == 3) { - pps.stride_t = strides[0]; - pps.stride_h = strides[1]; - pps.stride_w = strides[2]; + p.stride_t = strides[0]; + p.stride_h = strides[1]; + p.stride_w = strides[2]; } else if (strides.size() == 2) { - pps.stride_h = strides[0]; - pps.stride_w = strides[1]; + p.stride_h = strides[0]; + p.stride_w = strides[1]; } else if (strides.size() == 1) { - pps.stride_h = strides[0]; + p.stride_h = strides[0]; } - pps.padding_before = 0; - pps.padding_top = 0; - pps.padding_left = 0; - pps.padding_after = 0; - pps.padding_bottom = 0; - pps.padding_right = 0; + p.pad_before = 0; + p.pad_top = 0; + p.pad_left = 0; + p.pad_after = 0; + p.pad_bottom = 0; + p.pad_right = 0; if (pads.size() == 6) { - pps.padding_before = pads[0]; - pps.padding_top = pads[1]; - pps.padding_left = pads[2]; - pps.padding_after = pads[3]; - pps.padding_bottom = pads[4]; - pps.padding_right = pads[5]; + p.pad_before = pads[0]; + p.pad_top = pads[1]; + p.pad_left = pads[2]; + 
p.pad_after = pads[3]; + p.pad_bottom = pads[4]; + p.pad_right = pads[5]; } else if (pads.size() == 4) { - pps.padding_top = pads[0]; - pps.padding_left = pads[1]; - pps.padding_bottom = pads[2]; - pps.padding_right = pads[3]; + p.pad_top = pads[0]; + p.pad_left = pads[1]; + p.pad_bottom = pads[2]; + p.pad_right = pads[3]; } else if (pads.size() == 2) { - pps.padding_top = pads[0]; - pps.padding_bottom = pads[1]; + p.pad_top = pads[0]; + p.pad_bottom = pads[1]; + } else if (autoPad == "SAME_UPPER") { + p.pad_top = (p.kernel_h - 1) / 2; + p.pad_bottom = (p.kernel_h - 1) - p.pad_top; + p.pad_left = (p.kernel_w - 1) / 2; + p.pad_right = (p.kernel_w - 1) - p.pad_left; } - curPs.pooling_spec = pps; - return curPs; + ps.pooling_spec = p; + return ps; } // <======= Addition ``` @@ -408,13 +409,12 @@ In [model_tools](../model_tools), you can define any operator for model conversi ```c++ OperatorType convert_tflite_type(tflite::BuiltinOperator tfliteType) { - // Addition ======> - if (tfliteType == tflite::BuiltinOperator_MAX_POOL_2D || tfliteOperatorType == tflite::BuiltinOperator_AVERAGE_POOL_2D) { - return OT_Pooling; - } // <====== Addition - else if (tfliteType == tflite::BuiltinOperator_CONCATENATION) { - ... - } + std::map operatorMap = { + // Addition ======> + {tflite::BuiltinOperator_MAX_POOL_2D, OT_Pooling}, + {tflite::BuiltinOperator_AVERAGE_POOL_2D, OT_Pooling}, + // <====== Addition + }; } ``` @@ -422,13 +422,11 @@ In [model_tools](../model_tools), you can define any operator for model conversi ```c++ virtual EE adapt_operator(OperatorType type, ParameterSpec *ps) { - ... - // Addition ======> - else if (type == OT_Pooling) { - *ps = adapt_Pooling(); - } - // <====== Addition - ... + std::map functions = { + // Addition ======> + {OT_Pooling, &ModelAdaptee::adapt_Pooling}, + // <====== Addition + }; } // Addition ======> @@ -442,19 +440,18 @@ In [model_tools](../model_tools), you can define any operator for model conversi // Addition ======> ParameterSpec adapt_Pooling() override { - ParameterSpec curPs; - memset(&curPs, 0, sizeof(curPs)); - PoolingParamSpec poolingPs; - memset(&poolingPs, 0, sizeof(poolingPs)); - poolingPs.kernel_t = 1; - poolingPs.stride_t = 1; - poolingPs.padding_before = 0; - poolingPs.padding_after = 0; - poolingPs.padding_top = 0; - poolingPs.padding_bottom = 0; - poolingPs.padding_left = 0; - poolingPs.padding_right = 0; - poolingPs.rm = CEIL; + ParameterSpec ps; + PoolingParamSpec p; + memset(&p, 0, sizeof(p)); + p.kernel_t = 1; + p.stride_t = 1; + p.pad_before = 0; + p.pad_after = 0; + p.pad_top = 0; + p.pad_bottom = 0; + p.pad_left = 0; + p.pad_right = 0; + p.round_mode = ROUND_CEIL; const auto &inputTensor = this->tfliteTensors[this->tfliteOperators[this->tfliteOperatorIndex]->inputs[0]]; @@ -466,49 +463,47 @@ In [model_tools](../model_tools), you can define any operator for model conversi const auto &axisData = tfliteModelBuffer[axisTensor->buffer]->data; auto axisPtr = reinterpret_cast(axisData.data()); CHECK_REQUIREMENT(1 == axisPtr[0] && 2 == axisPtr[1]); - poolingPs.mode = POOLING_MEAN; - poolingPs.kernel_h = 0; - poolingPs.kernel_w = 0; - poolingPs.stride_h = 1; - poolingPs.stride_w = 1; + p.mode = POOLING_MEAN; + p.kernel_h = 0; + p.kernel_w = 0; + p.stride_h = 1; + p.stride_w = 1; } else { const auto &tflitePoolOption = this->tfliteOperators[this->tfliteOperatorIndex]->builtin_options.AsPool2DOptions(); - poolingPs.kernel_h = tflitePoolOption->filter_height; - poolingPs.kernel_w = tflitePoolOption->filter_width; - poolingPs.stride_h = 
tflitePoolOption->stride_h; - poolingPs.stride_w = tflitePoolOption->stride_w; + p.kernel_h = tflitePoolOption->filter_height; + p.kernel_w = tflitePoolOption->filter_width; + p.stride_h = tflitePoolOption->stride_h; + p.stride_w = tflitePoolOption->stride_w; int tfPaddingRoundMode = tflitePoolOption->padding; if (tfPaddingRoundMode == 0) { - poolingPs.rm = TF_SAME; - - int oLength = (inputShape[2] + poolingPs.stride_w - 1) / poolingPs.stride_w; - int padLength = UNI_MAX( - (oLength - 1) * poolingPs.stride_w + poolingPs.kernel_w - inputShape[2], 0); - poolingPs.padding_left = padLength / 2; - poolingPs.padding_right = padLength - poolingPs.padding_left; - - oLength = (inputShape[1] + poolingPs.stride_h - 1) / poolingPs.stride_h; - padLength = UNI_MAX( - (oLength - 1) * poolingPs.stride_h + poolingPs.kernel_h - inputShape[1], 0); - poolingPs.padding_top = padLength / 2; - poolingPs.padding_bottom = padLength - poolingPs.padding_top; + p.round_mode = ROUND_TF_SAME; + + int oLength = (inputShape[2] + p.stride_w - 1) / p.stride_w; + int padLength = UNI_MAX((oLength - 1) * p.stride_w + p.kernel_w - inputShape[2], 0); + p.pad_left = padLength / 2; + p.pad_right = padLength - p.pad_left; + + oLength = (inputShape[1] + p.stride_h - 1) / p.stride_h; + padLength = UNI_MAX((oLength - 1) * p.stride_h + p.kernel_h - inputShape[1], 0); + p.pad_top = padLength / 2; + p.pad_bottom = padLength - p.pad_top; } else if (tfPaddingRoundMode == 1) { - poolingPs.rm = TF_VALID; + p.round_mode = ROUND_TF_VALID; } else { UNI_ERROR_LOG("can not process operator location:%d Pooling round mode.\n", this->tfliteOperatorIndex); } if (opCode == tflite::BuiltinOperator_MAX_POOL_2D) { - poolingPs.mode = POOLING_MAX; + p.mode = POOLING_MAX; } else if (opCode == tflite::BuiltinOperator_AVERAGE_POOL_2D) { - poolingPs.mode = POOLING_MEAN; + p.mode = POOLING_MEAN; } insertActivationOperator( getActivationOperatorType(tflitePoolOption->fused_activation_function)); } - curPs.pooling_spec = poolingPs; - return curPs; + ps.pooling_spec = p; + return ps; } // <====== Addition ``` diff --git a/docs/INSTALL.md b/docs/INSTALL.md index 0379054d..093156f8 100644 --- a/docs/INSTALL.md +++ b/docs/INSTALL.md @@ -57,10 +57,10 @@ - #### Android NDK - Refer to the [NDK installation example](https://askubuntu.com/questions/837847/how-to-install-android-ndk) to install [android-ndk-r20](https://developer.android.google.cn/ndk/downloads) and set shell environment variable **ANDROID_NDK_ROOT**. + Refer to the [NDK installation example](https://askubuntu.com/questions/837847/how-to-install-android-ndk) to install [android-ndk-r22b](https://developer.android.google.cn/ndk/downloads) and set shell environment variable **ANDROID_NDK_ROOT**. ``` - export ANDROID_NDK_ROOT=/data/opt/android-ndk-r20 + export ANDROID_NDK_ROOT=/data/opt/android-ndk-r22b ``` ### Linux-AArch64 Target System Cross-Compilation Tools @@ -143,7 +143,7 @@ We will install Bolt to *install_[target]* directory. These subdirectories will - *benchmark* for measuring inference performance of bolt model These examples will be build when using *--example* option - - *classification* for imagenet classification task,*c_image_classifification* is a simplified C API version + - *classification* for imagenet classification task,*c_image_classification* is a simplified C API version - *u2net* for object detection - *ultra_face* for face detection - *tinybert* and *tinybert_onnx* for tinybert intention identification @@ -176,6 +176,9 @@ We will install Bolt to *install_[target]* directory. 
These subdirectories will 7. optional. save to *third_party/sources/jpegsrc.v9c.tar.gz* when using example. 8. optional. save to *third_party/sources/ffts-master.zip* when using Flow. 9. optional. save to *third_party/sources/opencv-4.5.2.zip* when using face detection example. + 10. optional. save to *third_party/sources/half-2.2.0.zip* when using on-device training. + 11. optional. save to *third_party/sources/Yato-9b5a49f6ec4169b67b9e5ffd11fdae9c238b0a3d.zip* when using on-device training. + 12. optional. save to *third_party/sources/huawei_secure_c-master.zip* when using Huawei secure C functions. - #### MinGW version error diff --git a/docs/KIT.md b/docs/KIT.md index f11a11b7..009800c8 100644 --- a/docs/KIT.md +++ b/docs/KIT.md @@ -5,6 +5,11 @@         [Android Overview](#android-overview)     [Examples](#examples)         [Image Classification](#image-classification) +        [Camera Enlarge](#camera-enlarge) +        [Semantics Analysis](#semantics-analysis) +        [Chinese Speech Recognition](#chinese-speech-recognition) +        [Face Detection](#face-detection) +        [Reading Comprehension](#reading-comprehension) # Overview --- @@ -31,7 +36,7 @@ In the [kit](../kit) directory, you can find the available demo project. In orde - ### Image Classification -
+
The demo takes video input from camera, and uses [GhostNet](https://github.com/huawei-noah/ghostnet) model trained on ImageNet. Given the same FLOPs, GhostNet shows a clear advantage over other lightweight CNNs. The models that we provide are trained with width as 1.0 on TensorFlow, which reaches a TOP1 accuracy of 74%. @@ -73,15 +78,15 @@ In the [kit](../kit) directory, you can find the available demo project. In orde - ### Camera Enlarge -
+
The demo takes video input from camera, 32 pixels x 32 pixels, and uses [ESR_EA](https://github.com/huawei-noah/vega/blob/master/docs/en/algorithms/esr_ea.md) model to enlarge input image to 64 pixels x 64 pixels. You can easily switch to other models trained on other datasets, following the steps below. As a tutorial, we will show how to change the model to the FP16 ESR_EA that is also included in the project (kit/models). - 0. Similar with Image Classification + 0. Similar with Image Classification. - 1. Similar with Image Classification + 1. Similar with Image Classification. 2. Adjust the pixelProcess function, which is registered as the preprocessing function for the Inference node. For FP16 inference, actual input to the model should be in FP16: @@ -118,9 +123,9 @@ In the [kit](../kit) directory, you can find the available demo project. In orde } ``` -- ### Semantics +- ### Semantics Analysis -
+
The demo tokenize input words, and use [tinybert](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/TinyBERT) model to do senmantic analysis. @@ -160,7 +165,7 @@ In the [kit](../kit) directory, you can find the available demo project. In orde float[][] result = boltResult.getResultData(); ``` - 3. Obtain the analysis result by comparing the size of the two probabilities in the result array + 3. Obtain the analysis result by comparing the size of the two probabilities in the result array. ``` if (result[0][0]>result[0][1]) { @@ -170,17 +175,17 @@ In the [kit](../kit) directory, you can find the available demo project. In orde } ``` -- ### ChineseSpeechRecognition +- ### Chinese Speech Recognition -
+
The demo recognizes the input Chinese speech, and uses the [ASR](https://github.com/huawei-noah/xxx) model to convert Chinese text. You can easily switch to other models trained on other datasets, following the steps below. As a tutorial, we will show how to change the model to the FP32 ASR that is also included in the project. - 0. Call the copyAssetAndWrite method to copy the path, and then change the path of the bin file and bolt model in the prototxt file to the copied path + 0. Call the copyAssetAndWrite method to copy the path, and then change the path of the bin file and bolt model in the prototxt file to the copied path. - 1. Import flow_asr.h in native-lib, flow_asr defines the pre- and post-processing methods and the initialization of flow and the acquisition of results,add init method and get result method in native-lib.cpp + 1. Import flow_asr.h in native-lib, flow_asr defines the pre- and post-processing methods and the initialization of flow and the acquisition of results,add init method and get result method in native-lib.cpp. ``` extern "C" @@ -206,7 +211,7 @@ In the [kit](../kit) directory, you can find the available demo project. In orde } ``` - 2. Call Jni method initFlow + 2. Call Jni method initFlow. ``` initFlow(getCacheDir()+"/encoder_flow.prototxt",getCacheDir()+"/prediction_flow.prototxt", @@ -214,28 +219,67 @@ In the [kit](../kit) directory, you can find the available demo project. In orde ``` - 3. Call Jni method runFlow Incoming audio files in wav format get result + 3. Call Jni method runFlow Incoming audio files in wav format get result. ``` runFlow(wavFileName) ``` -- ### FaceDetection +- ### Face Detection -
+
The demo detects the input picture, and outputs A photo framed a human face. - 0. bolt path get Similar with Semantics + 0. bolt path get Similar with Semantics. - 1. Call the getDetectionImgPath method Bitmap and model path to go directly to the detection result picture path + 1. Call the getDetectionImgPath method Bitmap and model path to go directly to the detection result picture path. ``` resultImgPath=boltResult.getDetectionImgPath(bitmap,boltPath); ``` - 2. The parameters in the prior_boxes_generator method in the jni method initBolt are fixed input parameters of the model and cannot be changed + 2. The parameters in the prior_boxes_generator method in the jni method initBolt are fixed input parameters of the model and cannot be changed. ``` prior_boxes_generator(320,240,0.7,0.3); - ``` \ No newline at end of file + ``` + +- ### Reading Comprehension + +
+ + +The demo is to input a piece of content, and input a content-related question will output the corresponding answer + +0. Call the copyAssetAndWrite method to copy the path, and the model path is used in the BoltModel class. + +1. Incoming content and questions to obtain the input data required by the dynamic library. + + ``` + float[][] tokenizers = appTokenizer.runTokenizer(content.getText().toString(), question.getText().toString()); + ``` + + 2. set the input and output names and other input parameters according to your model to initialize BoltModel. + + ``` + BoltModel boltModel = new BoltModel(modelPath, AffinityType.CPU_HIGH_PERFORMANCE, inputNum, inputName, inputN,inputCMax, inputH, inputW, inputDatatype, inputDataFormat, outputNum, outputName); + BoltResult boltResult = boltModel.run(inputNum, inputName, inputN, inputCActual, inputH, inputW, + inputDatatype, inputDataFormat, tokenizers); + + ``` + + 3. Call the run method of the BoltModel class to obtain the output result. Tokenizers are the processed input data, and inputCActual is the actual length of the input data. Call getResultData of BoltResult class to get the analysis result, get the result array, two float data. + + ``` + BoltResult boltResult = boltModel.run(inputNum, inputName, inputN, inputCActual, inputH, inputW, + inputDatatype, inputDataFormat, tokenizers); + float[][] result = boltResult.getResultData(); + ``` + + 4. Call the getResultAnswer method to get the answer of the output result conversion + + ``` + String resultStr = getResultAnswer(result); + + ``` \ No newline at end of file diff --git a/docs/OPERATORS.md b/docs/OPERATORS.md index 75a15a08..73e0836e 100644 --- a/docs/OPERATORS.md +++ b/docs/OPERATORS.md @@ -12,7 +12,7 @@ | Prelu | prelu activation | | BatchNorm | y = (x - mean) / sqrt(variance + eps) per channel | | LayerNorm | layernorm | -| L2Normalization | L2-Normalization | +| L2Normalization | L2 Normalization | | Reduction | sum, min, max, mean reduction | | ArgMax | max value index | | Softmax | y = exp(x - max(x)) / sum(exp(x - max(x))) | @@ -51,7 +51,7 @@ | PreAllocatedMemory | allocate memory | | SharedWeight | used to represent onnx/tflite operator input that is not generated by another operator | | Copy | memory copy | -| Check | tensor level compare, result is used for Jump | +| Check | element level compare, same as onnx Greater, GreaterOrEqual, Equal, LowerOrEqual, Lower | | Repeat | do while loop for dynamic control flow | | Jump | if statement for dynamic control flow | | Attention | transformer global attention mask | @@ -70,22 +70,35 @@ | Where | onnx where| | SoftPlus | y = log(1 + e ^ x)| | Exp | y = exp(x) | -| Split | y = x | +| OneHot | same as onnx one hot | | Tdnn | Kaldi tdnn operator(Splice + Linear) | | Dropout | dropout function | | TopK | same as onnx topk | | SpaceToBatchNd | tensorflow space_to_batch function | | BatchToSpaceNd | tensorflow batch_to_space function | | Abs | y = (x > 0) ? x : -x | -| Equal | elementwise tensor compare, same as onnx equal, this also support tflite NOT_EQUAL | +| NonZero | same as onnx non zero | | Sign | y = sign(x) | | HSwishNoDiv | y = x * relu6(x + 3) | | InstanceNorm | Instance Normalization | | Expand | onnx expand | | Scatter | onnx scatter, scatter_elements, scatterND | -| Log | y = log(x) | | Select | y = choice ? a : b, same as tflite select | | Not | y = ! 
(x), same as onnx not | -| RoIAlign | same as onnx RoIAlign | +| Reciprocal | same as onnx reciprocal, y = 1 / x | +| Log | y = log(x) | | GenerateProposals | same as tf tf.image.generate_bounding_box_proposals | -| Reciprocal | same as onnx reciprocal | +| RoIAlign | same as onnx RoIAlign | +| GAT | graph attention module | +| QuantizeLinear | int8 quantization | +| Round | y = round(x) | +| Floor | y = floor(x) | +| Ceil | y = ceil(x) | +| RandomUniform | same as onnx random uniform | +| CumSum | prefix sum, same as onnx cumsum | +| GridSample | same as onnx grid_sample | +| NonMaxSuppression | same as onnx non max suppression | +| Range | same as onnx range | +| Swish | y = x * exp(x) | +| Split | y = x | +| ~~Equal~~ | elementwise tensor compare, same as onnx equal, this also support tflite NOT_EQUAL, Equal is replaced with Check | diff --git a/docs/REDUCE_GPU_PREPARE_TIME.md b/docs/REDUCE_GPU_PREPARE_TIME.md index b3825009..20ff3c8c 100644 --- a/docs/REDUCE_GPU_PREPARE_TIME.md +++ b/docs/REDUCE_GPU_PREPARE_TIME.md @@ -1,25 +1,30 @@ -# How to reduce gpu initial time +# How to reduce gpu inference overhead --- -Bolt support ARM Mali GPU, large addtitional prepare time is cost due to algorithm selecting and building kernel from source code. +Bolt supports ARM GPU inference with OpenCL. +But building OpenCL kernel function from source code and selecting optimal algorithm takes up a lot of time. +They can be optimized by preparing the OpenCL binary function library and algorithm file in advance. +Inference can directly use prepared files. -- ### Build extra resources for reducing prepare time on GPU +- ### Build OpenCL binary kernel library - Bolt provides offline tools [preprocess_ocl](../inference/engine/tools/preprocess_ocl/build_preprocess_ocl.sh) to reduce GPU prepare time. We have test mobilenet_v1 on MALI G76 GPU. Prepare time can be reduced from 2-3s to 60ms after build algorithm file and OpenCL kernel binary. Here we give an exaple: + Bolt provides offline tool [preprocess_ocl](../inference/engine/tools/preprocess_ocl/build_preprocess_ocl.sh) to reduce GPU prepare time. + We have test mobilenet_v1 model on ARM MALI G76 GPU. Prepare time can be reduced from 2-3s to 60ms after building OpenCL binary kernel and algorithm file. 
+ Here we give an example: -- #### Step By Step + - #### Step By Step - <1> Connect target device by Andriod ADB; + <1> Connect target device by using Andriod *ADB*; - <2> Convert your models to .bolt with X2bolt; + <2> Convert your models to xxx.bolt by using *X2bolt*; - <3> Make a write/read able folder on target device, copy all your needed .bolt models into it, E.g: + <3> Create a directory on target device, copy all your needed xxx.bolt models into it, E.g: ``` adb shell "mkdir /data/local/tmp/preprocess_bolt_models" adb shell "cp ${boltModelDir}/*.bolt /data/local/tmp/preprocess_bolt_models" ``` - <4> Set essential variables for sh */inference/engine/tools/preproces_ocl/build_preprocess_ocl.sh*: + <4> Set essential command line arguments for shell script [preprocess_ocl](../inference/engine/tools/preprocess_ocl/build_preprocess_ocl.sh): - dNum: Device serial number, which can be aquired by using command @@ -30,32 +35,36 @@ Bolt support ARM Mali GPU, large addtitional prepare time is cost due to algorit - device_bolt_models: which is created in step <3>; for example: + ``` - ./build_preprocess_ocl.sh --device dNum --target android-aarch64 -d device_bolt_models + ./build_preprocess_ocl.sh --device 435bc850 --target android-aarch64 -d /data/local/tmp/preprocess_bolt_models ``` <5> Run *build_preprocess_ocl.sh* on host; - After running build_preprocess_ocl.sh successfully, these extra xxxlib.so will be produced: - - - OpenCL kernel bin dynamic library: All needed kernels for your model has been compiled from sources to bins, and package into .so, such as: *${BOLT_ROOT}/inference/engine/tools/preprocess_ocl/lib/libMali_G76p_map.so* + After running build_preprocess_ocl.sh successfully, OpenCL binary kernel shared library libxxx_map.so will be produced. + All needed kernels for your models has been compiled from sources to bins, + and packaged into libxxx_map.so, such as *${BOLT_ROOT}/inference/engine/tools/preprocess_ocl/lib/libMali_G76p_map.so* + +- ### Use OpenCL binary kernel library to reduce gpu prepare time for your model -- ### Use algorithm file and kernel binary dynamic library to reduce gpu prepare time for your model + - #### Reduce Imagenet classification prepare time -- #### Reduce Imagenet classification prepare time - ``` - adb shell "mkdir /data/local/tmp/kits" - adb push install_arm_llvm/kits/classification /data/local/tmp/kits - adb push tools/preprocess_ocl/lib/libMali_G76p_map.so /data/local/tmp/kits - adb shell "cd /data/local/tmp/kits && export LD_LIBRARY_PATH=./ && ./classification -m ./mobilenet_v1_f16.bolt -a GPU" - ``` + ``` + adb shell "mkdir /data/local/tmp/kits" + adb push install_arm_llvm/kits/classification /data/local/tmp/kits + adb push tools/preprocess_ocl/lib/libMali_G76p_map.so /data/local/tmp/kits + adb shell "cd /data/local/tmp/kits && export LD_LIBRARY_PATH=./ && ./classification -m ./mobilenet_v1_f16.bolt -a GPU" + ``` -- #### Reduce C project prepare time + - #### Reduce C project prepare time - - Argument *algoFileStream* of C API *ModelHandle CreateModelWithFileStream( const char *modelFileStream, AFFINITY_TYPE affinity, const char *algoFileStream)* is used to set your algofile filestream; - - Package kernel binary dynamic library into your project; + Package kernel binary dynamic library into your project, and put it in *libbolt.so* directory. 
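As an illustrative sketch (not part of this diff): with the prepared libxxx_map.so placed next to libbolt.so, a C project only passes the model path and affinity, and leaves the algorithm argument empty as described in the Note below. The model file name is a placeholder, and the calls are the ones declared in *inference/engine/api/c/bolt.h*.

```c
#include "bolt.h"

int main()
{
    /* libMali_G76p_map.so (or the library matching your GPU) is assumed to be
       shipped in the same directory as libbolt.so, so it can be found via dlopen. */
    ModelHandle model = CreateModel("./mobilenet_v1_f16.bolt", GPU, NULL);

    /* ... PrepareModel / AllocAllResultHandle / RunModel /
       GetOutputDataFromResultHandle as in the normal C API flow ... */

    DestroyModel(model);
    return 0;
}
```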
- ### Note - - Kernel binary dynamic library are binding with specific GPU type and your bolt models; - - Please run it under file path "/data/local/tmp" for android devices to ensure the program get full authorities; - - Argument *algoPath* of C API *ModelHandle CreateModel(const char *modelPath, AFFINITY_TYPE affinity, const char *algoPath)* is abandoned, for now algoInfo has been packaged into xxxlib.so, please set it *NULL*; + - OpenCL kernel functions are stored in the shared library libxxx_map.so in binary form. + Shared library libxxx_map.so is binding with specific GPU type and bolt models. + Bolt will use C system function *dlopen* to open shared library libxxx_map.so, please save it in same directory. + - Please run prepare program under */data/local/tmp* directory for android devices to ensure the program has write permission. + - Argument *algoPath* of C API *ModelHandle CreateModel(const char \*modelPath, AFFINITY_TYPE affinity, const char \*algoPath)* is abandoned in latest version, + algorithm file has been packaged into libxxx_map.so, please set it to *NULL*. \ No newline at end of file diff --git a/docs/USER_HANDBOOK.md b/docs/USER_HANDBOOK.md index 8ebb039d..630488c3 100644 --- a/docs/USER_HANDBOOK.md +++ b/docs/USER_HANDBOOK.md @@ -6,25 +6,20 @@ Before you try any step described in this document, please make sure you have in         [Model Conversion](#model-conversion)         [Model Inference](#model-inference)         [API](#api) -        [Performance Profiling](#performance-profiling) +        [Performance Profiling](#performance-profiling) +        [Model Visualization](#model-visualization) +        [Model Protection](#model-protection) +        [Environment variables](#environment-variables)     [Advanced Features](#advanced-features)         [INT8 Post Training Quantization](#int8-post-training-quantization)         [BNN Network Support](#bnn-network-support)         [Algorithm Tuning for Key Layers](#algorithm-tuning-for-key-layers)         [Time-Series Data Acceleration](#time-series-data-acceleration) +        [How to reduce gpu inference overhead](#how-to-reduce-gpu-inference-overhead) # Basic Usage --- -### Environment variables - -Some Linux shell environment variables are reserved for Bolt. - -- *BOLT_MEMORY_REUSE_OPTIMIZATION*: whether to use memory reuse optimization. The default value is ON, You can set it *OFF* before model conversion to disable memory reuse optimization. Note that this setting takes effect during the model conversion. Once the model (.bolt) is stored, the memory reuse behavior is fixed. -- *BOLT_PADDING*: Bolt only supports RNN/GRU/LSTM hidden states number mod 32 = 0 case, If you want to run number mod 32 != 0 case, please set it to *ON* before model conversion. The default value is ON. -- *BOLT_INT8_STORAGE_ERROR_THRESHOLD*: Bolt supports storage precision and computation precision independent. You can use int8 model storage, FP32/FP16 computation. There will be a huge accuracy error when you quantize all float weight to int8 storage. So we provide a configure parameter to control only quantize < *BOLT_INT8_STORAGE_ERROR_THRESHOLD* weight. -- *Bolt_TensorComputing_LibraryAlgoritmMap*: a path on the target device set by user to save tensor_computing library performance tuning result. - ### Model Conversion
@@ -40,10 +35,12 @@ Some Linux shell environment variables are reserved for Bolt. * [X2bolt](../model_tools/tools/X2bolt/X2bolt.cpp) is a general converter, which focuses on converting different deep learning model to bolt model. -Here we list the examples of two typical model conversions for Android backend, for X86 backend the ADB tool is not required. +*Here we list the examples of two typical model conversions for Android backend, for X86 backend the ADB tool is not required.* #### Caffe/ONNX/Tflite Model Conversion +Here we give an example of Caffe model conversion. ONNX and Tflite Model Conversions are similar to Caffe. The only difference is the suffix and number of model files. **If you want to convert ONNX model, you would better simplify ONNX model with [onnx-sim](https://github.com/daquexian/onnx-simplifier)**. + resnet50(caffe) model contains two model files : [resnet50.prototxt](https://github.com/KaimingHe/deep-residual-networks/blob/master/prototxt/ResNet-50-deploy.prototxt) and [resnet50.caffemodel](https://deepdetect.com/models/resnet/ResNet-50-model.caffemodel). Prepare these two model files on */home/resnet/* in advance. 1. Push your model to the phone; @@ -66,20 +63,22 @@ resnet50(caffe) model contains two model files : [resnet50.prototxt](https://git adb shell "./X2bolt --help" ``` -3. Execute ***X2bolt*** to convert a model from caffe model to bolt model. Here shows the example of float16 model conversion. +3. Execute ***X2bolt*** to convert a model from caffe model to bolt model. Here shows the example of float32 model conversion. ``` - adb shell "/data/local/tmp/bolt/tools/X2bolt -d /data/local/tmp/models/resnet50/ -m resnet50 -i FP16" + adb shell "/data/local/tmp/bolt/tools/X2bolt -d /data/local/tmp/models/resnet50/ -m resnet50 -i FP32" adb shell "ls /data/local/tmp/models/resnet50" - # command output$ resnet50_fp16.bolt + # command output$ resnet50_fp32.bolt ``` Note : Model conversion procedure of onnx and tflite is similar to caffe. #### Tensorflow Model Conversion -Save your mobilenet_v1 to frozen .pb model. And preprocess your model using [tf2json](../model_tools/tools/tensorflow2json/tf2json.py) which can convert the .pb to .json. Then use **X2bolt** to convert .json to .bolt model. +Save your mobilenet_v1 to frozened .pb model. +Preprocess .pb model using [tf2json](../model_tools/tools/tensorflow2json/tf2json.py) which can convert the .pb to .json. +Convert .json to .bolt model with **X2bolt**. Here is the example of mobilenet_v1_frozen.pb converted to mobilenet_v1.bolt. @@ -322,6 +321,36 @@ Bolt provides a program performance visualization interface to help user identif 4. Use Google Chrome browser to open extension. Load the JSON file. You can see the program execution time. ![](images/PerformanceProfiling.PNG) +### Model Visualization + +Bolt provides two ways to see model structure. + +- #### Using **-V** option in X2bolt or post_training_quantization to print model structure + +
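For example, the model structure can be printed during conversion; the command below is only an illustration (not from this diff) that assumes *-V* is simply appended to the X2bolt invocation shown earlier in this handbook:

```
adb shell "/data/local/tmp/bolt/tools/X2bolt -d /data/local/tmp/models/resnet50/ -m resnet50 -i FP32 -V"
```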
+ +- #### [Using netron to visualise bolt model](https://github.com/huawei-noah/bolt/issues/97) + +
+ + +### Model Protection + +If you don't want others to know your model structure, you can follow these steps to achieve goal. + +1. modify enum type *OperatorType*'s order and *OperatorTypeName* function in [common/uni/include/operator_type.h](common/uni/include/operator_type.h). +2. set cmake option *USE_MODEL_PRINT* to *OFF* in [common/cmakes/bolt.cmake](common/cmakes/bolt.cmake). + +### Environment variables + +Some Linux shell environment variables are reserved for Bolt. + +- *BOLT_MEMORY_REUSE_OPTIMIZATION*: whether to use memory reuse optimization. The default value is ON, You can set it *OFF* before model conversion to disable memory reuse optimization. Note that this setting takes effect during the model conversion. Once the model (.bolt) is stored, the memory reuse behavior is fixed. +- *BOLT_PADDING*: Bolt only supports RNN/GRU/LSTM hidden states number mod 32 = 0 case, If you want to run number mod 32 != 0 case, please set it to *ON* before model conversion. The default value is ON. +- *BOLT_INT8_STORAGE_ERROR_THRESHOLD*: Bolt supports storage precision and computation precision independent. You can use int8 model storage, FP32/FP16 computation. There will be a huge accuracy error when you quantize all float weight to int8 storage. So we provide a configure parameter to control only quantize < *BOLT_INT8_STORAGE_ERROR_THRESHOLD* weight. +- *Bolt_TensorComputing_LibraryAlgoritmMap*: a path on the target device set by user to save tensor_computing library performance tuning result. + + # Advanced Features --- @@ -368,3 +397,10 @@ Flow is the time-series data acceleration module for Bolt. Flow simplifies the a Flow provides flexible CPU multi-core parallelism and heterogeneous scheduling (CPU + GPU). User don't need to pay excessive attention to heterogeneous management and write lots of non-reusable code to implement a heterogeneous application. User can get the best end-to-end performance with the help of Flow. Flow supports data parallelism and subgraph parallelism, with a simple API. More usage information can be find in [DEVELOPER.md](./DEVELOPER.md#time-series-data-acceleration-by-using-flow). + +### How to reduce gpu inference overhead + +Bolt support ARM GPU inference with OpenCL, but there are a big overhead that is caused by compiling OpenCL kernel source code and selecting optimal algorithm. + +They can be optimized by preparing some files in advance. Inference can directly use prepared files. +You can refer [REDUCE_GPU_PREPARE_TIME.md](./REDUCE_GPU_PREPARE_TIME.md) for more details. 
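To make the reserved variables listed under *Environment variables* above concrete, here is an illustrative host-side conversion run (not part of the handbook; the values and paths only reuse the earlier resnet50 example):

```
# conversion-time settings (they take effect when the .bolt file is generated)
export BOLT_MEMORY_REUSE_OPTIMIZATION=OFF   # disable memory reuse optimization
export BOLT_PADDING=ON                      # keep RNN/GRU/LSTM padding support (default is ON)
./X2bolt -d ./models/resnet50/ -m resnet50 -i FP32
```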
diff --git a/docs/images/ChineseSpeechRecognition.PNG b/docs/images/ChineseSpeechRecognition.PNG deleted file mode 100644 index 66a6a8af..00000000 Binary files a/docs/images/ChineseSpeechRecognition.PNG and /dev/null differ diff --git a/docs/images/FaceDetection.PNG b/docs/images/FaceDetection.PNG deleted file mode 100644 index 85a93c96..00000000 Binary files a/docs/images/FaceDetection.PNG and /dev/null differ diff --git a/docs/images/Framework.PNG b/docs/images/Framework.PNG index a831acd8..109d9496 100644 Binary files a/docs/images/Framework.PNG and b/docs/images/Framework.PNG differ diff --git a/docs/images/ImageClassification.PNG b/docs/images/ImageClassification.PNG deleted file mode 100644 index 3603c9be..00000000 Binary files a/docs/images/ImageClassification.PNG and /dev/null differ diff --git a/docs/images/ReadingComprehension.gif b/docs/images/ReadingComprehension.gif new file mode 100644 index 00000000..bb695603 Binary files /dev/null and b/docs/images/ReadingComprehension.gif differ diff --git a/docs/images/Semantics.PNG b/docs/images/Semantics.PNG deleted file mode 100644 index fd299bad..00000000 Binary files a/docs/images/Semantics.PNG and /dev/null differ diff --git a/docs/images/X2bolt.PNG b/docs/images/X2bolt.PNG new file mode 100644 index 00000000..a57ca6aa Binary files /dev/null and b/docs/images/X2bolt.PNG differ diff --git a/docs/images/losses_of_training_lenet.PNG b/docs/images/losses_of_training_lenet.PNG new file mode 100644 index 00000000..68c34a70 Binary files /dev/null and b/docs/images/losses_of_training_lenet.PNG differ diff --git a/docs/images/losses_of_training_mobilenet.PNG b/docs/images/losses_of_training_mobilenet.PNG new file mode 100644 index 00000000..175bc606 Binary files /dev/null and b/docs/images/losses_of_training_mobilenet.PNG differ diff --git a/docs/images/losses_of_training_resnet.PNG b/docs/images/losses_of_training_resnet.PNG new file mode 100644 index 00000000..c8648f91 Binary files /dev/null and b/docs/images/losses_of_training_resnet.PNG differ diff --git a/docs/images/netron.PNG b/docs/images/netron.PNG new file mode 100644 index 00000000..e7545fa3 Binary files /dev/null and b/docs/images/netron.PNG differ diff --git a/inference/engine/api/c/bolt.h b/inference/engine/api/c/bolt.h index b729578a..bb51e422 100644 --- a/inference/engine/api/c/bolt.h +++ b/inference/engine/api/c/bolt.h @@ -33,9 +33,10 @@ typedef void *ResultHandle; /** CPU affinity policy */ typedef enum { - CPU_HIGH_PERFORMANCE = 0, ///< performance is high priority(use big core) - CPU_LOW_POWER = 1, ///< power is high priority(use small core) - GPU = 2 ///< use GPU + CPU = 0, ///< don't bind process to specific core + CPU_HIGH_PERFORMANCE = 1, ///< performance is high priority(use big core) + CPU_LOW_POWER = 2, ///< power is high priority(use small core) + GPU = 3 ///< use GPU } AFFINITY_TYPE; /** heterogeneous device type */ @@ -60,28 +61,20 @@ typedef enum { } DATA_TYPE; /** Get DATA_TYPE String */ -inline const char *const *GetDataTypeString() -{ - static const char *const names[] = {"FP_32", "FP_16", "INT_32", "UINT_32"}; - return names; -} +const char *const *GetDataTypeString(); /** multi-dimension data format */ typedef enum { NCHW = 0, ///< batch->channel->high->width data order NHWC = 1, ///< batch->high->width->channel data order - NCHWC8 = 2, ///< batch->channel/8->high->width->channel four element data order + NCHWC8 = 2, ///< batch->channel/8->high->width->channel eight element data order MTK = 3, ///< batch->time->unit data order NORMAL = 4, ///< batch->unit 
data order NCHWC4 = 5 ///< batch->channel/4->width->high->channel four element data order } DATA_FORMAT; /** Get DATA_FORMAT String */ -inline const char *const *GetDataFormatString() -{ - static const char *const names[] = {"NCHW", "NHWC", "NCHWC8", "MTK", "NORMAL"}; - return names; -} +const char *const *GetDataFormatString(); /** * @brief create model from file @@ -174,6 +167,12 @@ void PrepareModel(ModelHandle ih, * @param ih inference pipeline handle * * @return result data memory handle + * @note destroy result when unused + * @code + * ResultHandle result = AllocAllResultHandle(...); + * ... + * FreeResultHandle(result); + * @endcode */ ResultHandle AllocAllResultHandle(ModelHandle ih); @@ -213,6 +212,11 @@ int GetNumOutputsFromResultHandle(ResultHandle ir); * @note * name/n/c/h/w/dt/df array space must be allocated before calling, the array length must be equal to num_inputs. * each element of name must be allocated, the array length must be equal to 128. + * GetOutputDataInfoFromResultHandle must behind RunModel because RunModel will change ResultHandle. + * @code + * RunModel(...); + * GetOutputDataInfoFromResultHandle(...); + * @endcode */ void GetOutputDataInfoFromResultHandle(ResultHandle ir, int num_outputs, @@ -231,6 +235,11 @@ void GetOutputDataInfoFromResultHandle(ResultHandle ir, * @param data the array of all output data's content * * @return + * @note GetOutputDataFromResultHandle must behind RunModel because RunModel will change ResultHandle. + * @code + * RunModel(...); + * GetOutputDataFromResultHandle(...); + * @endcode */ void GetOutputDataFromResultHandle(ResultHandle ir, int num_outputs, void **data); @@ -337,6 +346,11 @@ void ResizeModelInput(ModelHandle ih, * @param name the array of tesor name that needed * * @return result data memory handle + * @code + * ResultHandle result = AllocSpecificResultHandle(...); + * ... + * FreeResultHandle(result); + * @endcode */ ResultHandle AllocSpecificResultHandle(ModelHandle ih, int num_outputs, const char **name); @@ -399,6 +413,15 @@ void SetRuntimeDeviceDynamic(ModelHandle ih); * @return */ void SetNumThreads(int threads); + +/** + * @brief check memory leak + * + * @note + * This can only be used at the end of program after Model and Result free. 
+ * @return + */ +void MemoryCheck(); #ifdef __cplusplus } #endif diff --git a/inference/engine/api/java/com/huawei/noah/AffinityType.java b/inference/engine/api/java/com/huawei/noah/AffinityType.java index 4cfe0704..5c458851 100644 --- a/inference/engine/api/java/com/huawei/noah/AffinityType.java +++ b/inference/engine/api/java/com/huawei/noah/AffinityType.java @@ -22,6 +22,7 @@ /** affinity policy */ public enum AffinityType { + CPU, ///< don't bind process to specific core CPU_HIGH_PERFORMANCE, ///< performance is high priority(use CPU big core) CPU_LOW_POWER, ///< power is high priority(use CPU small core) GPU ///< use ARM MALI GPU diff --git a/inference/engine/include/activation.hpp b/inference/engine/include/activation.hpp index 293f7119..bdfeada6 100644 --- a/inference/engine/include/activation.hpp +++ b/inference/engine/include/activation.hpp @@ -18,9 +18,9 @@ class Activation : public Operator { public: - Activation(ActivationParamSpec activationDesc) + Activation(ActivationParamSpec p) { - this->activationDesc = activationDesc; + this->p = p; std::map activationMap = {{ACTIVATION_RELU, OT_Relu}, {ACTIVATION_RELU6, OT_Relu6}, {ACTIVATION_H_SWISH, OT_HSwish}, {ACTIVATION_H_SWISH_NODIV, OT_HSwishNoDiv}, {ACTIVATION_SIGMOID, OT_Sigmoid}, @@ -28,13 +28,14 @@ class Activation : public Operator { {ACTIVATION_TANH, OT_TanH}, {ACTIVATION_MISH, OT_Mish}, {ACTIVATION_GREATER, OT_Greater}, {ACTIVATION_EXP, OT_Exp}, {ACTIVATION_SOFTPLUS, OT_SoftPlus}, {ACTIVATION_ABS, OT_Abs}, {ACTIVATION_SIGN, OT_Sign}, {ACTIVATION_NOT, OT_Not}, {ACTIVATION_LOG, OT_Log}, - {ACTIVATION_NEG, OT_Neg}}; - if (activationMap.find(activationDesc.mode) == activationMap.end()) { - UNI_ERROR_LOG("can not map ActivationMode to OperatorType.\n"); + {ACTIVATION_NEG, OT_Neg}, {ACTIVATION_ROUND, OT_Round}, {ACTIVATION_FLOOR, OT_Floor}, + {ACTIVATION_CEIL, OT_Ceil}, {ACTIVATION_SWISH, OT_Swish}, + {ACTIVATION_RECIPROCAL, OT_Reciprocal}}; + if (activationMap.find(p.mode) == activationMap.end()) { + UNI_ERROR_LOG("can not map ActivationMode(%d) to OperatorType.\n", p.mode); } else { - this->opt = activationMap[activationDesc.mode]; + this->opt = activationMap[p.mode]; } - this->lenOfTemp = 0; } OperatorType get_type() override @@ -48,7 +49,7 @@ class Activation : public Operator { } protected: - ActivationParamSpec activationDesc; + ActivationParamSpec p; OperatorType opt; }; diff --git a/inference/engine/include/attention.hpp b/inference/engine/include/attention.hpp index 87892b81..a992be7f 100644 --- a/inference/engine/include/attention.hpp +++ b/inference/engine/include/attention.hpp @@ -15,7 +15,6 @@ #define _ATTENTION_H #include "operator.hpp" -#include "tensor_computing.h" class Attention : public Operator { public: diff --git a/inference/engine/include/batch_norm.hpp b/inference/engine/include/batch_norm.hpp index 4dba830b..6f762f97 100644 --- a/inference/engine/include/batch_norm.hpp +++ b/inference/engine/include/batch_norm.hpp @@ -22,7 +22,6 @@ class BatchNorm : public WeightOperator { { this->dt = dt; this->p = p; - this->numChannels = 0; } OperatorType get_type() override @@ -32,7 +31,6 @@ class BatchNorm : public WeightOperator { protected: BatchNormParamSpec p; - U32 numChannels; }; #endif // _BATCH_NORM_H diff --git a/inference/engine/include/cast.hpp b/inference/engine/include/cast.hpp index 318422e7..78d84b72 100644 --- a/inference/engine/include/cast.hpp +++ b/inference/engine/include/cast.hpp @@ -29,11 +29,6 @@ class Cast : public Operator { return OT_Cast; } - bool can_input_output_the_same() override - { 
- return false; - } - public: CastParamSpec p; }; diff --git a/inference/engine/include/check.hpp b/inference/engine/include/check.hpp index 2c8ce0b0..f07dbbc2 100644 --- a/inference/engine/include/check.hpp +++ b/inference/engine/include/check.hpp @@ -14,9 +14,9 @@ #ifndef _CHECK_H #define _CHECK_H -#include "operator.hpp" +#include "weight_operator.hpp" -class Check : public Operator { +class Check : public WeightOperator { public: Check(DataType dt, CheckParamSpec p) { diff --git a/inference/engine/include/cnn.h b/inference/engine/include/cnn.h index c9b03f4b..b320c344 100644 --- a/inference/engine/include/cnn.h +++ b/inference/engine/include/cnn.h @@ -82,6 +82,8 @@ class CNN : public Model { void update_op_tensors(); + void update_tensor_positions(); + void set_input_desc(std::map inputDescMap); void infer_tmp_memory_size() override; @@ -98,6 +100,8 @@ class CNN : public Model { void clean_tensorMap_desc(); + void check_dynamic_output_size(OperatorType type); + private: std::map> tensorMap; std::map> operatorMap; @@ -116,5 +120,6 @@ class CNN : public Model { #ifdef _USE_GPU ImageContainer tmpImages; #endif + bool dynamicOutputSize = false; }; #endif diff --git a/inference/engine/include/constant.hpp b/inference/engine/include/constant.hpp index 93c3e344..d58cd1bd 100644 --- a/inference/engine/include/constant.hpp +++ b/inference/engine/include/constant.hpp @@ -40,7 +40,7 @@ class Constant : public Operator { { Tensor outputTensor = this->outputTensors[0]; auto outputPtr = ((CpuMemory *)outputTensor.get_memory())->get_ptr(); - memcpy(outputPtr, data, tensorNumBytes(constDesc)); + UNI_MEMCPY(outputPtr, data, tensorNumBytes(constDesc)); } EE infer_output_tensors_size(std::vector *outDims) override diff --git a/inference/engine/include/constant_of_shape.hpp b/inference/engine/include/constant_of_shape.hpp new file mode 100644 index 00000000..5ed4eb21 --- /dev/null +++ b/inference/engine/include/constant_of_shape.hpp @@ -0,0 +1,35 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#ifndef _CONSTANT_OF_SHAPE_H +#define _CONSTANT_OF_SHAPE_H + +#include "operator.hpp" + +class ConstantOfShape : public Operator { +public: + explicit ConstantOfShape(DataType dt, ConstantOfShapeParamSpec p) + { + this->dt = dt; + this->p = p; + } + + OperatorType get_type() override + { + return OT_ConstantOfShape; + } + +protected: + ConstantOfShapeParamSpec p; +}; +#endif // _CONSTANT_OF_SHAPE_H diff --git a/inference/engine/include/cpu/activation_cpu.hpp b/inference/engine/include/cpu/activation_cpu.hpp index c598c142..7294b264 100644 --- a/inference/engine/include/cpu/activation_cpu.hpp +++ b/inference/engine/include/cpu/activation_cpu.hpp @@ -18,13 +18,13 @@ class ActivationCPU : public Activation { public: - ActivationCPU(ActivationParamSpec activationDesc) : Activation(activationDesc) + ActivationCPU(ActivationParamSpec p) : Activation(p) {} std::shared_ptr clone() override { std::shared_ptr mem = - std::shared_ptr(new ActivationCPU(this->activationDesc)); + std::shared_ptr(new ActivationCPU(this->p)); *mem = *this; return mem; } @@ -33,15 +33,14 @@ class ActivationCPU : public Activation { { Tensor inputTensor = this->inputTensors[0]; Tensor outputTensor = this->outputTensors[0]; - CHECK_STATUS(activation(inputTensor, this->activationDesc, outputTensor, &this->archInfo)); + CHECK_STATUS(activation(inputTensor, this->p, outputTensor, &this->archInfo)); outputTensor.set_scale(inputTensor.get_scale()); } EE infer_output_tensors_size( std::vector inTensors, std::vector outTensors) override { - CHECK_STATUS(activation_infer_output_size(inTensors[0], outTensors[0], &this->archInfo)); - return SUCCESS; + return activation_infer_output_size(inTensors[0], outTensors[0], &this->archInfo); } }; diff --git a/inference/engine/include/cpu/batch_norm_cpu.hpp b/inference/engine/include/cpu/batch_norm_cpu.hpp index 651f962d..24f11d20 100644 --- a/inference/engine/include/cpu/batch_norm_cpu.hpp +++ b/inference/engine/include/cpu/batch_norm_cpu.hpp @@ -38,36 +38,29 @@ class BatchNormCPU : public BatchNorm { EE infer_output_tensors_size( std::vector inTensors, std::vector outTensors) override { - auto inputDesc = inTensors[0]->get_desc(); - this->set_channels_from_weight(); - TensorDesc outputDesc = inputDesc; - //if (outputDesc.nDims == 3 && this->p.axis == -1 && outputDesc.dims[0] == this->numChannels) { - // outputDesc.df = DF_NHWC; - //} - outTensors[0]->resize(outputDesc); + outTensors[0]->resize(inTensors[0]->get_desc()); return SUCCESS; } - void set_channels_from_weight() + int get_channels_num() { + int ret = 0; auto curOpWs = this->get_weightspec(); if (0 != curOpWs.bytes_of_weight) { - this->numChannels = curOpWs.bytes_of_weight / UNI_MAX(1, bytesOf(curOpWs.mdt)); + ret = curOpWs.bytes_of_weight / UNI_MAX(1, bytesOf(curOpWs.mdt)); } else if (0 != curOpWs.bytes_of_vec) { - this->numChannels = curOpWs.bytes_of_vec / UNI_MAX(1, bytesOf(curOpWs.mdt)); - } else { - this->numChannels = 0; + ret = curOpWs.bytes_of_vec / UNI_MAX(1, bytesOf(curOpWs.mdt)); } + return ret; } EE infer_weight_desc() override { - // weight is mean, bias is variance - this->set_channels_from_weight(); + int num = this->get_channels_num(); this->weightTensors = std::vector(1); - this->weightTensors[0].resize(tensor1d(this->dt, this->numChannels)); + this->weightTensors[0].resize(tensor1d(this->dt, num)); this->biasTensors = std::vector(1); - this->biasTensors[0].resize(tensor1d(this->dt, this->numChannels)); + this->biasTensors[0].resize(tensor1d(this->dt, num)); return SUCCESS; } diff --git 
a/inference/engine/include/cpu/cast_cpu.hpp b/inference/engine/include/cpu/cast_cpu.hpp index 4a7c74bd..544795b3 100644 --- a/inference/engine/include/cpu/cast_cpu.hpp +++ b/inference/engine/include/cpu/cast_cpu.hpp @@ -30,13 +30,13 @@ class CastCPU : public Cast { void run() override { - CHECK_STATUS(cast(this->inputTensors[0], this->outputTensors[0], this->p, &this->archInfo)); + CHECK_STATUS(cast(this->inputTensors[0], this->p, this->outputTensors[0], &this->archInfo)); } EE infer_output_tensors_size( std::vector inTensors, std::vector outTensors) override { - CHECK_STATUS(cast_infer_output_size(inTensors[0], outTensors[0], this->p, &this->archInfo)); + CHECK_STATUS(cast_infer_output_size(inTensors[0], this->p, outTensors[0], &this->archInfo)); return SUCCESS; } }; diff --git a/inference/engine/include/cpu/channel_resize_cpu.hpp b/inference/engine/include/cpu/channel_resize_cpu.hpp index cf5d6c5f..8fa0789c 100644 --- a/inference/engine/include/cpu/channel_resize_cpu.hpp +++ b/inference/engine/include/cpu/channel_resize_cpu.hpp @@ -43,7 +43,7 @@ class ChannelResizeCPU : public ChannelResize { if (!this->valid) { if (inputPtr != outputPtr) { CHECK_REQUIREMENT(inputSize == outputSize); - memcpy(outputPtr, inputPtr, inputSize); + UNI_MEMCPY(outputPtr, inputPtr, inputSize); } } else if (this->rearrange && DF_NCHWC8 == inputDesc.df && DF_NCHWC8 == outputDesc.df) { transformNCHWC8ToNCHWC8ByGroup( diff --git a/inference/engine/include/cpu/check_cpu.hpp b/inference/engine/include/cpu/check_cpu.hpp index 464721f9..92599660 100644 --- a/inference/engine/include/cpu/check_cpu.hpp +++ b/inference/engine/include/cpu/check_cpu.hpp @@ -31,7 +31,12 @@ class CheckCPU : public Check { void run() override { Tensor inputATensor = this->inputTensors[0]; - Tensor inputBTensor = this->inputTensors[1]; + Tensor inputBTensor; + if (this->weightTensors.size() > 0) { + inputBTensor = this->weightTensors[0]; + } else { + inputBTensor = this->inputTensors[1]; + } Tensor outputTensor = this->outputTensors[0]; CHECK_STATUS(check(inputATensor, inputBTensor, this->p, outputTensor, &this->archInfo)); } @@ -41,6 +46,17 @@ class CheckCPU : public Check { { return check_infer_output_size(inTensors, outTensors[0], &this->archInfo); } + + EE infer_weight_desc() override + { + auto curOpWs = this->get_weightspec(); + if (curOpWs.bytes_of_weight > 0) { + this->weightTensors = std::vector(1); + this->weightTensors[0].resize( + tensor2d(curOpWs.mdt, 1, curOpWs.bytes_of_weight / bytesOf(curOpWs.mdt))); + } + return SUCCESS; + } }; #endif // _CHECK_CPU_H diff --git a/inference/engine/include/cpu/concat_cpu.hpp b/inference/engine/include/cpu/concat_cpu.hpp index b758c63d..3cddb585 100644 --- a/inference/engine/include/cpu/concat_cpu.hpp +++ b/inference/engine/include/cpu/concat_cpu.hpp @@ -44,7 +44,8 @@ class ConcatCPU : public Concat { U32 infer_tmp_memory_size() override { U32 bytes = 0; - CHECK_STATUS(concat_infer_forward_tmp_bytes(this->inputTensors, &bytes, &this->archInfo)); + CHECK_STATUS(concat_infer_forward_tmp_bytes( + this->inputTensors, this->outputTensors[0], &bytes, &this->archInfo)); return bytes; } }; diff --git a/inference/engine/include/cpu/constant_of_shape_cpu.hpp b/inference/engine/include/cpu/constant_of_shape_cpu.hpp new file mode 100644 index 00000000..6e2f2f73 --- /dev/null +++ b/inference/engine/include/cpu/constant_of_shape_cpu.hpp @@ -0,0 +1,55 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _CONSTATNT_OF_SHAPE_CPU_H +#define _CONSTATNT_OF_SHAPE_CPU_H + +#include "constant_of_shape.hpp" + +class ConstantOfShapeCPU : public ConstantOfShape { +public: + ConstantOfShapeCPU(DataType dt, ConstantOfShapeParamSpec p) : ConstantOfShape(dt, p) + {} + + std::shared_ptr clone() override + { + std::shared_ptr mem = + std::shared_ptr(new ConstantOfShapeCPU(this->dt, this->p)); + *mem = *this; + return mem; + } + + void run() override + { + TensorDesc desc = this->outputTensors[0].get_desc(); + UNI_INIT(tensorNumElements(desc), desc.dt, this->p.value, + ((CpuMemory *)(this->outputTensors[0].get_memory()))->get_ptr()); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + TensorDesc inDesc = inTensors[0]->get_desc(); + TensorDesc outDesc; + outDesc.dt = this->p.dt; + outDesc.nDims = inDesc.dims[0]; + outDesc.df = getTensorDefaultDataFormat(outDesc.nDims); + for (U32 i = 0; i < outDesc.nDims; i++) { + outDesc.dims[i] = inDesc.dims[inDesc.nDims + inDesc.dims[0] - 1 - i]; + } + outTensors[0]->resize(outDesc); + return SUCCESS; + } +}; + +#endif // CONSTATNT_OF_SHAPE_CPU_H diff --git a/inference/engine/include/cpu/convolution_cpu.hpp b/inference/engine/include/cpu/convolution_cpu.hpp index cc5003d1..176a1735 100644 --- a/inference/engine/include/cpu/convolution_cpu.hpp +++ b/inference/engine/include/cpu/convolution_cpu.hpp @@ -33,17 +33,6 @@ class ConvolutionCPU : public Convolution { return mem; } - DataType get_float_precision() - { - DataType ret = this->dt; - if (this->dt == DT_F16_8Q) { - ret = DT_F16; - } else if (this->dt == DT_F32_8Q) { - ret = DT_F32; - } - return ret; - } - EE init_weight_bias_from_model(std::shared_ptr *modelPtrShared) override { U8 *modelPtr = nullptr; @@ -55,12 +44,21 @@ class ConvolutionCPU : public Convolution { if (modelPtr != nullptr) { filterDt = this->dt; } - DataType dtNoQ = this->get_float_precision(); + DataType dtNoQ = (dt == DT_F16_8Q) ? DT_F16 : ((dt == DT_F32_8Q) ? 
DT_F32 : dt); U32 isBNN = 0; if (filterDt == DT_BIN01 || filterDt == DT_BIN11) { isBNN = 1; } + if (curOpWs.num_quant_scale == this->weightTensors.size()) { + for (U32 i = 0; i < this->weightTensors.size(); ++i) { + if (curOpWs.weight_scale[i].num_scale > 0) { + this->weightTensors[i].set_scale_ptr( + std::shared_ptr(curOpWs.weight_scale[i].scale, [](F32 *) {})); + } + } + } + for (U32 i = 0; i < this->weightTensors.size(); i++) { TensorDesc desc = this->weightTensors[i].get_desc(); desc.dt = filterDt; @@ -69,7 +67,7 @@ class ConvolutionCPU : public Convolution { for (U32 i = 0; i < this->biasTensors.size(); i++) { TensorDesc desc = this->biasTensors[i].get_desc(); desc.dt = dtNoQ; - if (this->p.convolution_type == Convolution_Pointwise) { + if (this->p.convolution_type == CONVOLUTION_POINTWISE) { U32 vectorLen = this->p.num_outputs; // bias length if (isBNN == 1) { this->dt = dtNoQ; // BNN convolution should not be quantized further @@ -88,12 +86,12 @@ class ConvolutionCPU : public Convolution { U32 offset_bytes = 0; if (modelPtr != nullptr) { this->weightTensors[j].alloc(); - memcpy(((CpuMemory *)(this->weightTensors[j].get_memory()))->get_ptr(), modelPtr, - weight_bytes); + UNI_MEMCPY(((CpuMemory *)(this->weightTensors[j].get_memory()))->get_ptr(), + modelPtr, weight_bytes); offset_bytes += weight_bytes; if (this->hasBias) { this->biasTensors[j].alloc(); - memcpy(((CpuMemory *)(this->biasTensors[j].get_memory()))->get_ptr(), + UNI_MEMCPY(((CpuMemory *)(this->biasTensors[j].get_memory()))->get_ptr(), modelPtr + offset_bytes, bias_bytes); offset_bytes += bias_bytes; } @@ -106,7 +104,7 @@ class ConvolutionCPU : public Convolution { weight_offset += weight_bytes; if (this->hasBias) { this->biasTensors[j].alloc(); - memcpy(((CpuMemory *)(this->biasTensors[j].get_memory()))->get_ptr(), + UNI_MEMCPY(((CpuMemory *)(this->biasTensors[j].get_memory()))->get_ptr(), curOpWs.vec + bias_offset, bias_bytes); bias_offset += bias_bytes; } @@ -118,10 +116,10 @@ class ConvolutionCPU : public Convolution { U8 *ptr = (U8 *)((CpuMemory *)(this->biasTensors[j].get_memory()))->get_ptr(); UNI_INIT(p.num_outputs, DT_F16, 1.0, ptr); ptr += bias_bytes / 2; - memset(ptr, 0, bias_bytes / 2); // second half is bias + UNI_MEMSET(ptr, 0, bias_bytes / 2); // second half is bias #endif } else { - memset(((CpuMemory *)(this->biasTensors[j].get_memory()))->get_ptr(), 0, + UNI_MEMSET(((CpuMemory *)(this->biasTensors[j].get_memory()))->get_ptr(), 0, bias_bytes); } } @@ -143,62 +141,54 @@ class ConvolutionCPU : public Convolution { outputTensor.resize(outputDesc); F32 *scalePtr = nullptr; - switch (this->p.convolution_type) { - case Convolution_Pointwise: { #if defined(_USE_INT8) - if (DT_F16_8Q == this->dt || DT_F32_8Q == this->dt) { - TensorDesc inputDesc = inputTensor.get_desc(); - scalePtr = this->scales.get(); - scalePtr[0] = inputTensor.get_scale(); - if (featureScale.size() > 0 && featureScale[0][0] > 0) { - scalePtr[0] = featureScale[0][0]; - } else if (DT_F16 == inputDesc.dt) { - scalePtr[0] = -1; - } - if (featureScale.size() > 0 && (featureScale.back())[0] != -2) { - scalePtr[1] = (featureScale.back())[0]; - } else { - scalePtr[1] = -1; - } - } + if (DT_F16_8Q == this->dt || DT_F32_8Q == this->dt) { + TensorDesc inputDesc = inputTensor.get_desc(); + scalePtr = this->scales.get(); + scalePtr[0] = inputTensor.get_scale(); + if (DT_I8 != inputDesc.dt && DT_U8_Q != inputDesc.dt && featureScale.size() > 0 && + featureScale[0][0] > 0) { + scalePtr[0] = featureScale[0][0]; + } + if (featureScale.size() > 0 && 
(featureScale.back())[0] != -2) { + scalePtr[1] = (featureScale.back())[0]; + } else { + scalePtr[1] = -1; + } + } #endif + switch (this->p.convolution_type) { + case CONVOLUTION_DILATION: + case CONVOLUTION_POINTWISE: { std::vector tmpTensors(1, this->temp); CHECK_STATUS(convolution(this->inputTensors, filterTensor, p, this->pwAlg, scalePtr, biasTensor, tmpTensors, outputTensor, this->pwActivationParamSpec, &this->archInfo)); -#if defined(_USE_INT8) - auto outputDesc = outputTensor.get_desc(); - if (DT_I8 == outputDesc.dt || DT_U8_Q == outputDesc.dt) { - outputTensor.set_scale(scalePtr[1]); - } -#endif break; } - case Convolution_Depthwise: { + case CONVOLUTION_DEPTHWISE: { CHECK_STATUS(depthwise_convolution(this->inputTensors[0], filterTensor, p, - this->dwAlg, biasTensor, this->temp, outputTensor, this->dwActivationParamSpec, - &this->archInfo)); + this->dwAlg, scalePtr, biasTensor, this->temp, outputTensor, + this->dwActivationParamSpec, &this->archInfo)); break; } - case Convolution_Depthwise_Pointwise: { + case CONVOLUTION_DEPTHWISE_POINTWISE: { std::vector tmpTensors(1, this->temp); CHECK_STATUS(depthwise_pointwise_convolution(this->inputTensors, filterTensor, - weightTensors[1], p, this->dwAlg, biasTensor, biasTensors[1], tmpTensors, - outputTensor, this->dwActivationParamSpec, this->pwActivationParamSpec, - &this->archInfo)); - break; - } - case Convolution_Dilation: { - std::vector tmpTensors(1, this->temp); - CHECK_STATUS(convolution(this->inputTensors, filterTensor, p, this->pwAlg, scalePtr, - biasTensor, tmpTensors, outputTensor, this->pwActivationParamSpec, - &this->archInfo)); + weightTensors[1], p, this->dwAlg, scalePtr, biasTensor, biasTensors[1], + tmpTensors, outputTensor, this->dwActivationParamSpec, + this->pwActivationParamSpec, &this->archInfo)); break; } default: { UNI_ERROR_LOG("unsupported convolution type %d\n", this->p.convolution_type); } } +#if defined(_USE_INT8) + if (DT_I8 == outputDesc.dt || DT_U8_Q == outputDesc.dt) { + outputTensor.set_scale(scalePtr[1]); + } +#endif inputTensor.resize(oriInputDesc); outputTensor.resize(oriOutputDesc); } @@ -220,7 +210,8 @@ class ConvolutionCPU : public Convolution { DataType targetType = filterDesc.dt; I32 algo; switch (this->p.convolution_type) { - case Convolution_Pointwise: { + case CONVOLUTION_DILATION: + case CONVOLUTION_POINTWISE: { if (this->dt == DT_F16_8Q || this->dt == DT_F32_8Q) { #ifndef _USE_X86 targetType = DT_I8; @@ -244,7 +235,7 @@ class ConvolutionCPU : public Convolution { } break; } - case Convolution_Depthwise: { + case CONVOLUTION_DEPTHWISE: { if (algorithmMap->getAlgorithmInfoFromMap(this->name, &algo, 1)) { this->dwAlg = (DepthwiseConvolutionForwardAlgorithm)algo; } else { @@ -256,7 +247,7 @@ class ConvolutionCPU : public Convolution { } break; } - case Convolution_Depthwise_Pointwise: { + case CONVOLUTION_DEPTHWISE_POINTWISE: { if (algorithmMap->getAlgorithmInfoFromMap(this->name, &algo, 1)) { this->dwAlg = (DepthwiseConvolutionForwardAlgorithm)algo; } else { @@ -269,20 +260,9 @@ class ConvolutionCPU : public Convolution { } break; } - case Convolution_Dilation: { - if (algorithmMap->getAlgorithmInfoFromMap(this->name, &algo, 1)) { - this->pwAlg = (ConvolutionForwardAlgorithm)algo; - } else { - CHECK_STATUS(convolution_infer_forward_algorithm(inputTensor, filterTensor, - outputTensor, p, policy, &(this->pwAlg), targetType, - this->pwActivationParamSpec, &this->archInfo)); - algo = this->pwAlg; - algorithmMap->setAlgorithmInfoToMap(this->name, &algo, 1); - } - break; - } default: - 
CHECK_STATUS(NOT_SUPPORTED); + UNI_ERROR_LOG("not support to infer new type convolution's algorithm.\n"); + return NOT_SUPPORTED; } inputTensor.resize(oriInputDesc); outputTensor.resize(oriOutputDesc); @@ -317,7 +297,9 @@ class ConvolutionCPU : public Convolution { } } DataType targetType = this->dt; - if (Convolution_Pointwise == this->p.convolution_type) { + int numChannels = ic; + if (this->p.convolution_type == CONVOLUTION_DILATION || + this->p.convolution_type == CONVOLUTION_POINTWISE) { if (DT_F16_8Q == this->dt || DT_F32_8Q == this->dt) { #ifndef _USE_X86 targetType = DT_I8; @@ -325,10 +307,6 @@ class ConvolutionCPU : public Convolution { targetType = DT_U8_Q; #endif } - } - int numChannels = ic; - if (this->p.convolution_type == Convolution_Dilation || - this->p.convolution_type == Convolution_Pointwise) { numChannels /= this->p.group; } @@ -338,14 +316,14 @@ class ConvolutionCPU : public Convolution { channelAxis = 4; filterDesc.push_back(tensor5d(this->dt, this->p.num_outputs, numChannels, this->p.kernel_t, this->p.kernel_h, this->p.kernel_w)); - if (Convolution_Depthwise_Pointwise == this->p.convolution_type) { + if (CONVOLUTION_DEPTHWISE_POINTWISE == this->p.convolution_type) { filterDesc.push_back(tensor5d(this->dt, this->p.num_outputs, numChannels, 1, 1, 1)); } } else if (tensorIs4d(inDim)) { channelAxis = 3; filterDesc.push_back(tensor4d( this->dt, this->p.num_outputs, numChannels, this->p.kernel_h, this->p.kernel_w)); - if (Convolution_Depthwise_Pointwise == this->p.convolution_type) { + if (CONVOLUTION_DEPTHWISE_POINTWISE == this->p.convolution_type) { filterDesc.push_back(tensor4d(this->dt, this->p.num_outputs, numChannels, 1, 1)); } } @@ -354,13 +332,14 @@ class ConvolutionCPU : public Convolution { filterTensor[i].resize(filterDesc[i]); } switch (this->p.convolution_type) { - case Convolution_Pointwise: { + case CONVOLUTION_DILATION: + case CONVOLUTION_POINTWISE: { biasDesc.push_back(tensor1d(this->dt, this->p.num_outputs)); CHECK_STATUS(convolution_infer_output_size( inputTensor, filterTensor[0], p, outputTensor, targetType, &this->archInfo)); break; } - case Convolution_Depthwise: { + case CONVOLUTION_DEPTHWISE: { filterDesc[0].dims[channelAxis] = 1; filterTensor[0].resize(filterDesc[0]); biasDesc.push_back(tensor1d(this->dt, this->p.num_outputs)); @@ -368,7 +347,7 @@ class ConvolutionCPU : public Convolution { inputTensor, filterTensor[0], p, outputTensor, targetType, &this->archInfo)); break; } - case Convolution_Depthwise_Pointwise: { + case CONVOLUTION_DEPTHWISE_POINTWISE: { filterDesc[0].dims[channelAxis] = 1; filterTensor[0].resize(filterDesc[0]); biasDesc.push_back(tensor1d(this->dt, numChannels)); @@ -377,14 +356,9 @@ class ConvolutionCPU : public Convolution { filterTensor[0], filterTensor[1], p, outputTensor, targetType, &this->archInfo)); break; } - case Convolution_Dilation: { - biasDesc.push_back(tensor1d(this->dt, this->p.num_outputs)); - CHECK_STATUS(convolution_infer_output_size( - inputTensor, filterTensor[0], p, outputTensor, targetType, &this->archInfo)); - break; - } default: - CHECK_STATUS(NOT_SUPPORTED); + UNI_ERROR_LOG("not support to infer new type convolution's output.\n"); + return NOT_SUPPORTED; } TensorDesc outputDesc = outputTensor->get_desc(); if (featureScale.size() > 0 && -2 == (featureScale.back())[0]) { @@ -435,29 +409,26 @@ class ConvolutionCPU : public Convolution { U32 bytes = 0; switch (this->p.convolution_type) { - case Convolution_Pointwise: { + case CONVOLUTION_DILATION: + case CONVOLUTION_POINTWISE: { 
CHECK_STATUS(convolution_infer_forward_tmp_bytes(inputTensor, filterTensor, outputTensor, p, this->pwAlg, &bytes, &this->archInfo)); break; } - case Convolution_Depthwise: { + case CONVOLUTION_DEPTHWISE: { CHECK_STATUS(depthwise_convolution_infer_forward_tmp_bytes(inputTensor, filterTensor, outputTensor, p, this->dwAlg, &bytes, &this->archInfo)); break; } - case Convolution_Depthwise_Pointwise: { + case CONVOLUTION_DEPTHWISE_POINTWISE: { CHECK_STATUS(depthwise_pointwise_convolution_infer_forward_tmp_bytes(inputTensor, filterTensor, this->weightTensors[1], outputTensor, p, this->dwAlg, &bytes, &this->archInfo)); break; } - case Convolution_Dilation: { - CHECK_STATUS(convolution_infer_forward_tmp_bytes(inputTensor, filterTensor, - outputTensor, p, this->pwAlg, &bytes, &this->archInfo)); - break; - } default: - CHECK_STATUS(NOT_SUPPORTED); + UNI_ERROR_LOG("not support to infer new type convolution's tmp memory.\n"); + break; } inputTensor.resize(oriInputDesc); outputTensor.resize(oriOutputDesc); @@ -469,28 +440,26 @@ class ConvolutionCPU : public Convolution { auto filterTensor = this->weightTensors[0]; U32 bytes = 0; switch (this->p.convolution_type) { - case Convolution_Pointwise: { + case CONVOLUTION_DILATION: + case CONVOLUTION_POINTWISE: { CHECK_STATUS(convolution_transform_filter_bytes( filterTensor, this->p, this->pwAlg, &bytes, &this->archInfo)); break; } - case Convolution_Depthwise: { + case CONVOLUTION_DEPTHWISE: { CHECK_STATUS(depthwise_convolution_transform_filter_bytes( filterTensor, this->p, this->dwAlg, &bytes, &this->archInfo)); break; } - case Convolution_Depthwise_Pointwise: { + case CONVOLUTION_DEPTHWISE_POINTWISE: { CHECK_STATUS(depthwise_pointwise_convolution_transform_filter_bytes(filterTensor, weightTensors[1], this->p, this->dwAlg, &bytes, bytesExtra, &this->archInfo)); break; } - case Convolution_Dilation: { - CHECK_STATUS(convolution_transform_filter_bytes( - filterTensor, this->p, this->pwAlg, &bytes, &this->archInfo)); - break; - } default: - CHECK_STATUS(NOT_SUPPORTED); + UNI_ERROR_LOG("not support to infer new type convolution's tramsform filter tmp " + "memory.\n"); + break; } return bytes; } @@ -501,9 +470,10 @@ class ConvolutionCPU : public Convolution { this->wtm = std::shared_ptr(new Tensor()); TensorDesc wtmDesc; + // int8 winograd if ((DT_F16_8Q == this->dt || DT_F32_8Q == this->dt) && - Convolution_Pointwise == this->p.convolution_type && - CONVOLUTION_ALGORITHM_WINOGRAD == this->pwAlg) { // int8 winograd + CONVOLUTION_POINTWISE == this->p.convolution_type && + CONVOLUTION_ALGORITHM_WINOGRAD == this->pwAlg) { #if defined(_USE_INT8) U32 ftBytes; CHECK_STATUS(convolution_transform_filter_bytes( @@ -525,24 +495,32 @@ class ConvolutionCPU : public Convolution { this->scales = std::shared_ptr((F32 *)operator new(38 * bytesOf(DT_F32))); CHECK_STATUS( quantize(tFilter, this->wtm.get(), this->scales.get() + 2, &(this->archInfo))); + // int8 tilegemm } else if ((DT_F16_8Q == this->dt || DT_F32_8Q == this->dt) && - Convolution_Pointwise == this->p.convolution_type) { // int8 tilegemm + (CONVOLUTION_POINTWISE == this->p.convolution_type || + CONVOLUTION_DILATION == this->p.convolution_type)) { TensorDesc qDesc = filterTensor.get_desc(); - qDesc.dt = DT_I8; - Tensor qFilterTensor = Tensor::alloc_sized(qDesc); this->scales = std::shared_ptr((F32 *)operator new(3 * bytesOf(DT_F32))); - this->scales.get()[2] = -1; - CHECK_STATUS( - quantize(filterTensor, &qFilterTensor, this->scales.get() + 2, &(this->archInfo))); + if (qDesc.dt != DT_I8) { + qDesc.dt = DT_I8; + Tensor 
qFilterTensor = Tensor::alloc_sized(qDesc); + this->scales.get()[2] = -1; + CHECK_STATUS(quantize( + filterTensor, &qFilterTensor, this->scales.get() + 2, &(this->archInfo))); + filterTensor = qFilterTensor; + filterTensor.set_scale(this->scales.get()[2]); + } else { + this->scales.get()[2] = filterTensor.get_scale(); + } U32 ftmBytes; CHECK_STATUS(convolution_transform_filter_bytes( - qFilterTensor, this->p, this->pwAlg, &ftmBytes, &this->archInfo)); + filterTensor, this->p, this->pwAlg, &ftmBytes, &this->archInfo)); *(this->wtm.get()) = Tensor::alloc_sized(tensor1d(DT_U8, ftmBytes)); // trans filter CHECK_STATUS(convolution_transform_filter( - qFilterTensor, this->p, this->pwAlg, this->temp, this->wtm.get(), &this->archInfo)); + filterTensor, this->p, this->pwAlg, this->temp, this->wtm.get(), &this->archInfo)); #endif } else { // All other cases U32 bytesExtra; @@ -551,17 +529,18 @@ class ConvolutionCPU : public Convolution { wtm->alloc(); switch (this->p.convolution_type) { - case Convolution_Pointwise: { + case CONVOLUTION_DILATION: + case CONVOLUTION_POINTWISE: { CHECK_STATUS(convolution_transform_filter(filterTensor, this->p, this->pwAlg, this->temp, this->wtm.get(), &this->archInfo)); break; } - case Convolution_Depthwise: { + case CONVOLUTION_DEPTHWISE: { CHECK_STATUS(depthwise_convolution_transform_filter( filterTensor, this->p, this->dwAlg, this->wtm.get(), &this->archInfo)); break; } - case Convolution_Depthwise_Pointwise: { + case CONVOLUTION_DEPTHWISE_POINTWISE: { Tensor pwTensor; pwTensor.resize(tensor1d(DT_U8, bytesExtra)); pwTensor.alloc(); @@ -571,13 +550,9 @@ class ConvolutionCPU : public Convolution { weightTensors[1] = pwTensor; break; } - case Convolution_Dilation: { - CHECK_STATUS(convolution_transform_filter(filterTensor, this->p, this->pwAlg, - this->temp, this->wtm.get(), &this->archInfo)); - break; - } default: - CHECK_STATUS(NOT_SUPPORTED); + UNI_ERROR_LOG("not support to transform new type convolution's filter.\n"); + return NOT_SUPPORTED; } } this->weightTensors[0] = *this->get_wtm(); diff --git a/inference/engine/include/cpu/copy_cpu.hpp b/inference/engine/include/cpu/copy_cpu.hpp index 83f482c3..f1828570 100644 --- a/inference/engine/include/cpu/copy_cpu.hpp +++ b/inference/engine/include/cpu/copy_cpu.hpp @@ -43,11 +43,11 @@ class CopyCPU : public Copy { U32 copyLength = (this->p.length >= 0) ? this->p.length : tensorNumElements(srcDesc) / batch; U32 srcBatchStride = (this->p.src_dims[0] >= 0) ? this->p.src_dims[0] : tensorNumElements(srcDesc) / batch; - U32 srcStride = (this->p.src_dims[0] >= 0) ? this->p.src_dims[1] + U32 srcStride = (this->p.src_dims[1] >= 0) ? this->p.src_dims[1] : tensorNumElements(srcDesc) / batch; U32 dstBatchStride = (this->p.dst_dims[0] >= 0) ? this->p.dst_dims[0] : tensorNumElements(dstDesc) / batch; - U32 dstStride = (this->p.dst_dims[0] >= 0) ? this->p.dst_dims[1] + U32 dstStride = (this->p.dst_dims[1] >= 0) ? this->p.dst_dims[1] : tensorNumElements(dstDesc) / batch; for (U32 i = 0; i < batch; i++) { U32 srcBlockIndex = 0; diff --git a/inference/engine/include/cpu/cumsum_cpu.hpp b/inference/engine/include/cpu/cumsum_cpu.hpp new file mode 100644 index 00000000..0508a98a --- /dev/null +++ b/inference/engine/include/cpu/cumsum_cpu.hpp @@ -0,0 +1,45 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _CUMSUM_CPU_H +#define _CUMSUM_CPU_H + +#include "cumsum.hpp" + +class CumSumCPU : public CumSum { +public: + CumSumCPU(DataType dt, CumSumParamSpec p) : CumSum(dt, p) + {} + + std::shared_ptr clone() override + { + std::shared_ptr mem = + std::shared_ptr(new CumSumCPU(this->dt, this->p)); + *mem = *this; + return mem; + } + + void run() override + { + CHECK_STATUS(cumsum(inputTensors[0], this->p, outputTensors[0], &this->archInfo)); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + CHECK_STATUS(cumsum_infer_output_size(inTensors[0], outTensors[0], &this->archInfo)); + return SUCCESS; + } +}; + +#endif // CUMSUM_CPU_H diff --git a/inference/engine/include/cpu/deconvolution_cpu.hpp b/inference/engine/include/cpu/deconvolution_cpu.hpp index caadfe7a..6d7d467a 100644 --- a/inference/engine/include/cpu/deconvolution_cpu.hpp +++ b/inference/engine/include/cpu/deconvolution_cpu.hpp @@ -33,20 +33,18 @@ class DeconvolutionCPU : public Deconvolution { EE infer_weight_desc() override { auto curOpWs = this->get_weightspec(); - DataType filterDt = curOpWs.mdt; // weight data type may not be the same as input and output + DataType fdt = curOpWs.mdt; if (curOpWs.weight == nullptr) { - filterDt = this->dt; + fdt = this->dt; } - DataType dtNoQ = (this->dt == DT_F16_8Q) ? DT_F16 : this->dt; - CHECK_REQUIREMENT(filterDt != DT_BIN01 && filterDt != DT_BIN11); - DataFormat filterDf = DF_NCHW; - TensorDesc filterTensorDesc = tensor4df(filterDt, filterDf, this->numInputs, - this->p.num_outputs, this->p.kernel_h, this->p.kernel_w); - // bias length - U32 vectorLen = this->numInputs * this->p.group; + if (fdt == DT_BIN01 || fdt == DT_BIN11) { + return NOT_MATCH; + } + TensorDesc filterTensorDesc = tensor4df( + fdt, DF_NCHW, this->numInputs, this->p.num_outputs, this->p.kernel_h, this->p.kernel_w); // bias data type should be the same as input and output - TensorDesc vectorTensorDesc = tensor1d(dtNoQ, vectorLen); - + DataType dtNoQ = (dt == DT_F16_8Q) ? DT_F16 : ((dt == DT_F32_8Q) ? 
DT_F32 : dt); + TensorDesc vectorTensorDesc = tensor1d(dtNoQ, this->numInputs * this->p.group); this->weightTensors = std::vector(1); this->weightTensors[0].resize(filterTensorDesc); this->biasTensors = std::vector(1); @@ -62,13 +60,8 @@ class DeconvolutionCPU : public Deconvolution { Tensor outputTensor = this->outputTensors[0]; TensorDesc oriOutputDesc = outputTensor.get_desc(); outputTensor.resize(transformDescTo4d(oriOutputDesc)); - Tensor filterTensor = this->weightTensors[0]; Tensor biasTensor = this->biasTensors[0]; - auto filterDesc = filterTensor.get_desc(); - if (filterDesc.dt == DT_BIN01 || filterDesc.dt == DT_BIN11) { - CHECK_STATUS(NOT_SUPPORTED); - } CHECK_STATUS(deconvolution(inputTensor, filterTensor, p, this->alg, nullptr, biasTensor, this->temp, outputTensor, this->activationDesc, &this->archInfo)); inputTensor.resize(oriInputDesc); @@ -120,7 +113,7 @@ class DeconvolutionCPU : public Deconvolution { filterTensor.resize(filterDim); DataType targetType = this->dt; - if (DT_F16_8Q == this->dt) { + if (DT_F16_8Q == this->dt || DT_F32_8Q == this->dt) { targetType = DT_I8; } @@ -167,10 +160,10 @@ class DeconvolutionCPU : public Deconvolution { Tensor filterTensor = this->weightTensors[0]; auto wtmBytes = this->infer_wtm_memory_size(); Tensor wtm = Tensor::alloc_sized(tensor1d(DT_U8, wtmBytes)); - CHECK_STATUS(deconvolution_transform_filter( - filterTensor, this->p, this->alg, this->temp, &wtm, &this->archInfo)); + EE ret = deconvolution_transform_filter( + filterTensor, this->p, this->alg, this->temp, &wtm, &this->archInfo); this->weightTensors[0] = wtm; - return SUCCESS; + return ret; } }; diff --git a/inference/engine/include/cpu/depth2space_cpu.hpp b/inference/engine/include/cpu/depth2space_cpu.hpp new file mode 100644 index 00000000..ba673f84 --- /dev/null +++ b/inference/engine/include/cpu/depth2space_cpu.hpp @@ -0,0 +1,47 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
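The new *_cpu.hpp headers introduced by this patch (constant_of_shape_cpu.hpp, cumsum_cpu.hpp, depth2space_cpu.hpp above, and the others that follow) share one Operator-subclass skeleton: a constructor taking the data type and param spec, clone(), run(), and infer_output_tensors_size(). The sketch below only illustrates that pattern; ExampleCPU, Example, ExampleParamSpec and the example()/example_infer_output_size() kernels are hypothetical placeholders, not code from the patch.

class ExampleCPU : public Example {
public:
    ExampleCPU(DataType dt, ExampleParamSpec p) : Example(dt, p)
    {}

    std::shared_ptr<Operator> clone() override
    {
        // copy-construct a new operator with the same data type and parameters
        std::shared_ptr<ExampleCPU> mem =
            std::shared_ptr<ExampleCPU>(new ExampleCPU(this->dt, this->p));
        *mem = *this;
        return mem;
    }

    void run() override
    {
        // forward the bound input/output tensors to the compute kernel
        CHECK_STATUS(example(inputTensors[0], this->p, outputTensors[0], &this->archInfo));
    }

    EE infer_output_tensors_size(
        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
    {
        // shape inference only: resize the output descriptor, no allocation here
        return example_infer_output_size(inTensors[0], this->p, outTensors[0], &this->archInfo);
    }
};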
+ +#ifndef _DEPTH2SPACE_CPU_H +#define _DEPTH2SPACE_CPU_H + +#include "depth2space.hpp" + +class Depth2SpaceCPU : public Depth2Space { +public: + Depth2SpaceCPU(DataType dt, Depth2SpaceParamSpec p) : Depth2Space(dt, p) + {} + + std::shared_ptr clone() override + { + std::shared_ptr mem = + std::shared_ptr(new Depth2SpaceCPU(this->dt, this->p)); + *mem = *this; + return mem; + } + + void run() override + { + CHECK_STATUS( + depth2space(inputTensors[0], this->p, this->temp, outputTensors[0], &this->archInfo)); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + CHECK_STATUS( + depth2space_infer_output_size(inTensors[0], this->p, outTensors[0], &this->archInfo)); + return SUCCESS; + } +}; + +#endif // DEPTH2SPACE_CPU_H diff --git a/inference/engine/include/cpu/eltwise_cpu.hpp b/inference/engine/include/cpu/eltwise_cpu.hpp index 086f858c..14d1581a 100644 --- a/inference/engine/include/cpu/eltwise_cpu.hpp +++ b/inference/engine/include/cpu/eltwise_cpu.hpp @@ -18,71 +18,26 @@ class EltwiseCPU : public Eltwise { public: - EltwiseCPU(EltwiseParamSpec eltwiseDesc) : Eltwise(eltwiseDesc) + EltwiseCPU(EltwiseParamSpec p) : Eltwise(p) {} std::shared_ptr clone() override { - std::shared_ptr mem = - std::shared_ptr(new EltwiseCPU(this->eltwiseDesc)); + std::shared_ptr mem = std::shared_ptr(new EltwiseCPU(this->p)); *mem = *this; return mem; } - bool use_scale(const std::vector &inputDesc) - { - bool ret; - if (this->eltwiseDesc.elt_mode == ELTWISE_PROD && inputDesc.size() == 2 && - inputDesc[0].nDims > 1 && inputDesc[1].nDims > 1 && - inputDesc[0].dims[inputDesc[0].nDims - 2] == inputDesc[1].dims[inputDesc[1].nDims - 2] && - inputDesc[1].dims[inputDesc[1].nDims - 1] == 1 && - (inputDesc[1].nDims == 2 || (inputDesc[1].nDims == 3 && inputDesc[1].dims[0] == 1) || - (inputDesc[1].nDims == 4 && inputDesc[1].dims[0] == 1 && inputDesc[1].dims[1] == 1)) && - tensorNumElements(inputDesc[0]) != tensorNumElements(inputDesc[1])) { - ret = true; - } else { - ret = false; - } - return ret; - } - void run() override { - std::vector inputDesc; - for (auto p : this->inputTensors) { - inputDesc.push_back(p.get_desc()); - } - if (this->use_scale(inputDesc)) { - Tensor inTensor = this->inputTensors[1]; - U8 *alpha = (U8 *)((CpuMemory *)(inTensor.get_memory()))->get_ptr(); - ScaleParamSpec scaleParam; - scaleParam.axis = 1; - CHECK_STATUS(scale(this->inputTensors[0], alpha, nullptr, scaleParam, - this->outputTensors[0], &this->archInfo)); - } else { - CHECK_STATUS(eltwise(this->inputTensors, this->eltwiseDesc, this->temp, - this->outputTensors[0], &this->archInfo)); - } + CHECK_STATUS(eltwise( + this->inputTensors, this->p, this->temp, this->outputTensors[0], &this->archInfo)); } EE infer_output_tensors_size( std::vector inTensors, std::vector outTensors) override { - std::vector inputDesc; - for (auto p : inTensors) { - inputDesc.push_back(p->get_desc()); - } - if (this->use_scale(inputDesc)) { - ScaleParamSpec scaleParam; - scaleParam.axis = 1; - TensorDesc desc = inTensors[1]->get_desc(); - U32 axisLen = desc.dims[desc.nDims - 2]; - CHECK_STATUS(scale_infer_output_size( - inTensors[0], scaleParam, axisLen, outTensors[0], &this->archInfo)); - } else { - CHECK_STATUS(eltwise_infer_output_size(inTensors, outTensors[0], &this->archInfo)); - } - return SUCCESS; + return eltwise_infer_output_size(inTensors, outTensors[0], &this->archInfo); } }; diff --git a/inference/engine/include/cpu/embedding_cpu.hpp b/inference/engine/include/cpu/embedding_cpu.hpp index 410d3c9c..d0a225d3 
100644 --- a/inference/engine/include/cpu/embedding_cpu.hpp +++ b/inference/engine/include/cpu/embedding_cpu.hpp @@ -81,9 +81,9 @@ class EmbeddingCPU : public Embedding { } TensorDesc weightDesc; if (this->p.transpose) { - weightDesc = tensor2df(this->dt, DF_TRANSPOSE, this->p.num_output, this->p.input_dim); + weightDesc = tensor2df(this->dt, DF_TRANSPOSE, this->p.num_outputs, this->p.num_inputs); } else { - weightDesc = tensor2df(this->dt, DF_NORMAL, this->p.input_dim, this->p.num_output); + weightDesc = tensor2df(this->dt, DF_NORMAL, this->p.num_inputs, this->p.num_outputs); } U32 weightBytes = tensorNumBytes(weightDesc); @@ -93,15 +93,15 @@ class EmbeddingCPU : public Embedding { bool set_ptr = false; modelWeightTensor->alloc(); if (modelPtr != nullptr) { - memcpy( + UNI_MEMCPY( ((CpuMemory *)(modelWeightTensor->get_memory()))->get_ptr(), modelPtr, weightBytes); *modelPtrShared = std::shared_ptr(*modelPtrShared, modelPtr + weightBytes); set_ptr = true; } else { auto curOpWs = this->get_weightspec(); if (curOpWs.weight != nullptr) { - memcpy(((CpuMemory *)(modelWeightTensor->get_memory()))->get_ptr(), curOpWs.weight, - weightBytes); + UNI_MEMCPY(((CpuMemory *)(modelWeightTensor->get_memory()))->get_ptr(), + curOpWs.weight, weightBytes); set_ptr = true; } } diff --git a/inference/engine/include/cpu/equal_cpu.hpp b/inference/engine/include/cpu/equal_cpu.hpp index f0253911..5fae144e 100644 --- a/inference/engine/include/cpu/equal_cpu.hpp +++ b/inference/engine/include/cpu/equal_cpu.hpp @@ -44,10 +44,9 @@ class EqualCPU : public Equal { EE infer_weight_desc() override { auto curOpWs = this->get_weightspec(); - int weightBytes = curOpWs.bytes_of_weight; - int weightLen = weightBytes / bytesOf(curOpWs.mdt); this->weightTensors = std::vector(1); - this->weightTensors[0].resize(tensor2d(this->dt, 1, weightLen)); + this->weightTensors[0].resize( + tensor2d(curOpWs.mdt, 1, curOpWs.bytes_of_weight / bytesOf(curOpWs.mdt))); return SUCCESS; } }; diff --git a/inference/engine/include/cpu/expand_cpu.hpp b/inference/engine/include/cpu/expand_cpu.hpp index 09ebdf37..d57e2756 100644 --- a/inference/engine/include/cpu/expand_cpu.hpp +++ b/inference/engine/include/cpu/expand_cpu.hpp @@ -29,17 +29,36 @@ class ExpandCPU : public Expand { return mem; } + ExpandParamSpec get_param(TensorDesc desc) + { + ExpandParamSpec ps = this->p; + if (ps.num_shape == 0) { + ps.num_shape = desc.dims[0]; + for (int i = 0; i < ps.num_shape; i++) { + ps.shape[i] = desc.dims[desc.nDims + i]; + } + } + return ps; + } + void run() override { - CHECK_STATUS(expand( - this->inputTensors[0], this->p, this->temp, this->outputTensors[0], &this->archInfo)); + ExpandParamSpec ps = p; + if (ps.num_shape == 0 && inputTensors.size() > 1) { + ps = get_param(inputTensors[1].get_desc()); + } + CHECK_STATUS( + expand(this->inputTensors[0], ps, this->temp, this->outputTensors[0], &this->archInfo)); } EE infer_output_tensors_size( std::vector inTensors, std::vector outTensors) override { - CHECK_STATUS( - expand_infer_output_size(inTensors[0], this->p, outTensors[0], &this->archInfo)); + ExpandParamSpec ps = p; + if (ps.num_shape == 0 && inTensors.size() > 1) { + ps = get_param(inTensors[1]->get_desc()); + } + CHECK_STATUS(expand_infer_output_size(inTensors[0], ps, outTensors[0], &this->archInfo)); return SUCCESS; } }; diff --git a/inference/engine/include/cpu/factory_cpu.hpp b/inference/engine/include/cpu/factory_cpu.hpp index d2d4c35b..ed366356 100644 --- a/inference/engine/include/cpu/factory_cpu.hpp +++ 
b/inference/engine/include/cpu/factory_cpu.hpp @@ -66,7 +66,6 @@ #include "cpu/tdnn_fully_connected_cpu.hpp" #include "cpu/batch_norm_cpu.hpp" #include "cpu/cast_cpu.hpp" -#include "cpu/equal_cpu.hpp" #include "cpu/instance_norm_cpu.hpp" #include "cpu/expand_cpu.hpp" #include "cpu/scatter_cpu.hpp" @@ -74,6 +73,17 @@ #include "cpu/select_cpu.hpp" #include "cpu/topk_cpu.hpp" #include "cpu/gat_cpu.hpp" +#include "cpu/quantizelinear_cpu.hpp" +#include "cpu/grid_sample_cpu.hpp" +#include "cpu/onehot_cpu.hpp" +#include "cpu/cumsum_cpu.hpp" +#include "cpu/non_max_suppression_cpu.hpp" +#include "cpu/constant_of_shape_cpu.hpp" +#include "cpu/non_zero_cpu.hpp" +#include "cpu/roialign_cpu.hpp" +#include "cpu/range_cpu.hpp" +#include "cpu/depth2space_cpu.hpp" +#include "cpu/space2depth_cpu.hpp" class FactoryCPU : public Factory { public: @@ -173,9 +183,10 @@ class FactoryCPU : public Factory { return std::shared_ptr(cep); } - std::shared_ptr createLayerNorm(DataType dt, U32 weightNum) override + std::shared_ptr createLayerNorm( + DataType dt, LayerNormParamSpec p, U32 weightNum) override { - auto cep = (LayerNorm *)(new LayerNormCPU(dt, weightNum)); + auto cep = (LayerNorm *)(new LayerNormCPU(dt, p, weightNum)); return std::shared_ptr(cep); } @@ -264,9 +275,9 @@ class FactoryCPU : public Factory { return std::shared_ptr(cep); } - std::shared_ptr createPreAllocatedMemory(DataType dt, TensorDesc desc) override + std::shared_ptr createPreAllocatedMemory(PreAllocatedMemoryParamSpec p) override { - auto cep = (PreAllocatedMemory *)new PreAllocatedMemoryCPU(dt, desc); + auto cep = (PreAllocatedMemory *)new PreAllocatedMemoryCPU(p); return std::shared_ptr(cep); } @@ -288,13 +299,13 @@ class FactoryCPU : public Factory { std::shared_ptr createSpace2Depth(DataType dt, Space2DepthParamSpec p) override { - OP_UNSUP(2, dt, p); + auto cep = new Space2DepthCPU(dt, p); return std::shared_ptr(cep); } std::shared_ptr createDepth2Space(DataType dt, Depth2SpaceParamSpec p) override { - OP_UNSUP(2, dt, p); + auto cep = new Depth2SpaceCPU(dt, p); return std::shared_ptr(cep); } @@ -414,12 +425,6 @@ class FactoryCPU : public Factory { return std::shared_ptr(cep); } - std::shared_ptr createEqual(DataType dt, EqualParamSpec p) override - { - auto cep = new EqualCPU(dt, p); - return std::shared_ptr(cep); - } - std::shared_ptr createInstanceNorm(DataType dt, InstanceNormParamSpec p) override { auto cep = new InstanceNormCPU(dt, p); @@ -450,9 +455,9 @@ class FactoryCPU : public Factory { return std::shared_ptr(cep); } - std::shared_ptr createRoIAlign(RoIAlignParamSpec p) override + std::shared_ptr createRoIAlign(DataType dt, RoIAlignParamSpec p) override { - OP_UNSUP(1, p); + auto cep = new RoIAlignCPU(dt, p); return std::shared_ptr(cep); } @@ -468,5 +473,54 @@ class FactoryCPU : public Factory { auto cep = new GATCPU(dt, p); return std::shared_ptr(cep); } + + std::shared_ptr createQuantizeLinear(DataType dt, QuantizeLinearParamSpec p) override + { + auto cep = new QuantizeLinearCPU(dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createGridSample(DataType dt, GridSampleParamSpec p) override + { + auto cep = new GridSampleCPU(dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createOneHot(DataType dt, OneHotParamSpec p) override + { + auto cep = new OneHotCPU(dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createCumSum(DataType dt, CumSumParamSpec p) override + { + auto cep = new CumSumCPU(dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createNonMaxSuppression( + DataType 
dt, NonMaxSuppressionParamSpec p) override + { + auto cep = new NonMaxSuppressionCPU(dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createConstantOfShape(DataType dt, ConstantOfShapeParamSpec p) override + { + auto cep = new ConstantOfShapeCPU(dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createNonZero(DataType dt) override + { + auto cep = new NonZeroCPU(dt); + return std::shared_ptr(cep); + } + + std::shared_ptr createRange(DataType dt, RangeParamSpec p) override + { + auto cep = new RangeCPU(dt, p); + return std::shared_ptr(cep); + } }; #endif // _FACTORY_CPU_H diff --git a/inference/engine/include/cpu/fully_connected_cpu.hpp b/inference/engine/include/cpu/fully_connected_cpu.hpp index 70838487..701f2019 100644 --- a/inference/engine/include/cpu/fully_connected_cpu.hpp +++ b/inference/engine/include/cpu/fully_connected_cpu.hpp @@ -31,25 +31,15 @@ class FullyConnectedCPU : public FullyConnected { return mem; } - DataType get_float_precision() - { - DataType ret = this->dt; - if (this->dt == DT_F16_8Q) { - ret = DT_F16; - } else if (this->dt == DT_F32_8Q) { - ret = DT_F32; - } - return ret; - } - EE infer_weight_desc() override { - DataType dtNoQ = this->get_float_precision(); + DataType dtNoQ = (dt == DT_F16_8Q) ? DT_F16 : ((dt == DT_F32_8Q) ? DT_F32 : dt); auto curOpWs = this->get_weightspec(); + DataType weightDt = curOpWs.mdt; if (curOpWs.bytes_of_weight > 0) { this->weightTensors = std::vector(1); this->weightTensors[0].resize( - tensor2df(dtNoQ, DF_TRANSPOSE, this->p.num_outputs, this->numInput)); + tensor2df(weightDt, DF_TRANSPOSE, this->p.num_outputs, this->numInput)); } if (curOpWs.bytes_of_vec > 0) { this->biasTensors = std::vector(1); @@ -60,53 +50,50 @@ class FullyConnectedCPU : public FullyConnected { Tensor get_weight_tensor() { - Tensor weightTensor; if (weightTensors.size() > 0) { - weightTensor = this->weightTensors[0]; + return this->weightTensors[0]; } else { CHECK_REQUIREMENT(1 < this->inputTensors.size()); - weightTensor = this->inputTensors[1]; - TensorDesc desc = weightTensor.get_desc(); + TensorDesc desc = this->inputTensors[1].get_desc(); if (this->mvm) { desc.df = DF_TRANSPOSE; } else { desc.df = DF_NORMAL; } + Tensor weightTensor = this->inputTensors[1]; weightTensor.resize(desc); + return weightTensor; } - return weightTensor; } Tensor get_bias_tensor() { - Tensor biasTensor; - U32 inputCount = 1; - if (weightTensors.size() == 0) { - inputCount++; - } if (biasTensors.size() > 0) { - biasTensor = this->biasTensors[0]; + return this->biasTensors[0]; } else { + U32 inputCount = 1; + if (weightTensors.size() == 0) { + inputCount++; + } if (inputCount < this->inputTensors.size()) { - biasTensor = this->inputTensors[inputCount++]; + return this->inputTensors[inputCount++]; } + Tensor biasTensor; + return biasTensor; } - return biasTensor; } void run() override { - Tensor inputTensor = this->inputTensors[0]; - TensorDesc inputDesc = inputTensor.get_desc(); - Tensor weightTensor = get_weight_tensor(); Tensor biasTensor = get_bias_tensor(); Tensor outputTensor = this->outputTensors[0]; +#ifdef _USE_INT8 + TensorDesc inputDesc = this->inputTensors[0].get_desc(); TensorDesc outputDesc = outputTensor.get_desc(); - if (featureScale.size() > 1 && featureScale[0][0] > 0 && DT_I8 != inputDesc.dt && DT_U8_Q != inputDesc.dt) { - inputTensor.set_scale(featureScale[0][0]); + this->inputTensors[0].set_scale(featureScale[0][0]); } if (DT_I8 == outputDesc.dt || DT_U8_Q == outputDesc.dt) { if (featureScale.size() > 0) { @@ -115,10 +102,10 @@ class 
FullyConnectedCPU : public FullyConnected { outputTensor.set_scale(-1); } } - +#endif std::vector tmpTensor(1, this->temp); - CHECK_STATUS(fully_connected( - inputTensor, weightTensor, biasTensor, tmpTensor, outputTensor, &this->archInfo)); + CHECK_STATUS(fully_connected(this->inputTensors[0], weightTensor, biasTensor, tmpTensor, + outputTensor, &this->archInfo)); } EE infer_output_tensors_size( @@ -146,8 +133,8 @@ class FullyConnectedCPU : public FullyConnected { tmpFilter.resize(weightDesc); CHECK_STATUS(fully_connected_infer_output_size( inTensors[0], tmpFilter, outTensors[0], &this->archInfo)); - TensorDesc outputDesc = outTensors[0]->get_desc(); if (1 == this->p.num_slices) { + TensorDesc outputDesc = outTensors[0]->get_desc(); if (DT_F16_8Q == this->dt || DT_F32_8Q == this->dt) { if (featureScale.size() > 0 && -2 == (featureScale.back())[0]) { outputDesc.dt = (DT_F16_8Q == this->dt) ? DT_F16 : DT_F32; @@ -161,9 +148,11 @@ class FullyConnectedCPU : public FullyConnected { } outTensors[0]->resize(outputDesc); } else { - UNI_ERROR_LOG("FC merge is deprecated\n"); + //UNI_ERROR_LOG("FC merge is deprecated\n"); for (U32 i = 0; i < this->p.num_slices; i++) { + TensorDesc outputDesc = outTensors[i]->get_desc(); outputDesc.dims[0] = this->p.slice_point[i]; + UNI_INFO_LOG("-- %d %d\n", p.num_slices, p.slice_point[i]); if (DT_F16_8Q == this->dt || DT_F32_8Q == this->dt) { if (featureScale.size() > 0 && -2 == (featureScale.back())[0]) { outputDesc.dt = (DT_F16_8Q == this->dt) ? DT_F16 : DT_F32; @@ -175,6 +164,7 @@ class FullyConnectedCPU : public FullyConnected { #endif } } + outTensors[i]->resize(outputDesc); } } return SUCCESS; @@ -241,7 +231,7 @@ class FullyConnectedCPU : public FullyConnected { #ifdef _USE_INT8 bool thisIsNoQuant = (featureScale.size() > 1 && featureScale[0].back() == 0); - if ((DT_F16_8Q == this->dt || DT_F32_8Q == this->dt) && !thisIsNoQuant) { + if ((DT_F16_8Q == this->dt || DT_F32_8Q == this->dt) && !thisIsNoQuant && (tmpDesc.dt != DT_I8)) { tmpDesc.dt = DT_I8; Tensor qFilter = Tensor::alloc_sized(tmpDesc); F32 scale = -1; diff --git a/inference/engine/include/cpu/gather_cpu.hpp b/inference/engine/include/cpu/gather_cpu.hpp index 14e0e755..725b966b 100644 --- a/inference/engine/include/cpu/gather_cpu.hpp +++ b/inference/engine/include/cpu/gather_cpu.hpp @@ -39,6 +39,18 @@ class GatherCPU : public Gather { std::vector inTensors, std::vector outTensors) override { Tensor tensor0, tensor1; + if (is_shape(inTensors)) { + if ((this->p.data_desc.nDims > 0 && this->weightTensors.size() == 0) || + (this->p.index_desc.nDims > 0 && this->biasTensors.size() == 0)) { + CHECK_STATUS(this->init_weight_bias_from_model()); + } + if (this->p.data_desc.nDims > 0) { + this->p.data_desc = tensor_shape(this->weightTensors[0]); + } + if (this->p.index_desc.nDims > 0) { + this->p.index_desc = tensor_shape(this->biasTensors[0]); + } + } Tensor *dataTensor = get_data_tensor_ptr(inTensors, &tensor0); Tensor *indexTensor = get_index_tensor_ptr(inTensors, &tensor1); CHECK_STATUS(gather_infer_output_size( @@ -49,11 +61,11 @@ class GatherCPU : public Gather { EE infer_weight_desc() override { Tensor dataTensor, indexTensor; - if (this->p.data_desc.nDims > 0) { + if (this->p.data_desc.nDims > 0 && this->weightTensors.size() == 0) { dataTensor.resize(this->p.data_desc); this->weightTensors.push_back(dataTensor); } - if (this->p.index_desc.nDims > 0) { + if (this->p.index_desc.nDims > 0 && this->biasTensors.size() == 0) { indexTensor.resize(this->p.index_desc); this->biasTensors.push_back(indexTensor); 
} diff --git a/inference/engine/include/cpu/grid_sample_cpu.hpp b/inference/engine/include/cpu/grid_sample_cpu.hpp new file mode 100644 index 00000000..c64782da --- /dev/null +++ b/inference/engine/include/cpu/grid_sample_cpu.hpp @@ -0,0 +1,46 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _GRID_SAMPLE_CPU_H +#define _GRID_SAMPLE_CPU_H + +#include "grid_sample.hpp" + +class GridSampleCPU : public GridSample { +public: + GridSampleCPU(DataType dt, GridSampleParamSpec p) : GridSample(dt, p) + {} + + std::shared_ptr clone() override + { + std::shared_ptr mem = + std::shared_ptr(new GridSampleCPU(this->dt, this->p)); + *mem = *this; + return mem; + } + + void run() override + { + CHECK_STATUS(grid_sample(inputTensors[0], inputTensors[1], this->p, this->temp, + outputTensors[0], &this->archInfo)); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + return grid_sample_infer_output_size( + inTensors[0], inTensors[1], outTensors[0], &this->archInfo); + } +}; + +#endif // GRID_SAMPLE_CPU_H diff --git a/inference/engine/include/cpu/instance_norm_cpu.hpp b/inference/engine/include/cpu/instance_norm_cpu.hpp index 35d9dbb2..f2ca9dd6 100644 --- a/inference/engine/include/cpu/instance_norm_cpu.hpp +++ b/inference/engine/include/cpu/instance_norm_cpu.hpp @@ -38,33 +38,29 @@ class InstanceNormCPU : public InstanceNorm { EE infer_output_tensors_size( std::vector inTensors, std::vector outTensors) override { - auto inputDesc = inTensors[0]->get_desc(); - this->set_channels_from_weight(); - TensorDesc outputDesc = inputDesc; - outTensors[0]->resize(outputDesc); + outTensors[0]->resize(inTensors[0]->get_desc()); return SUCCESS; } - void set_channels_from_weight() + int get_channels_num() { + int ret = 0; auto curOpWs = this->get_weightspec(); if (0 != curOpWs.bytes_of_weight) { - this->numChannels = curOpWs.bytes_of_weight / UNI_MAX(1, bytesOf(curOpWs.mdt)); + ret = curOpWs.bytes_of_weight / UNI_MAX(1, bytesOf(curOpWs.mdt)); } else if (0 != curOpWs.bytes_of_vec) { - this->numChannels = curOpWs.bytes_of_vec / UNI_MAX(1, bytesOf(curOpWs.mdt)); - } else { - this->numChannels = 0; + ret = curOpWs.bytes_of_vec / UNI_MAX(1, bytesOf(curOpWs.mdt)); } + return ret; } EE infer_weight_desc() override { - // weight is scale, bias is bias - this->set_channels_from_weight(); + int num = this->get_channels_num(); this->weightTensors = std::vector(1); - 
this->weightTensors[0].resize(tensor1d(this->dt, this->numChannels)); + this->weightTensors[0].resize(tensor1d(this->dt, num)); this->biasTensors = std::vector(1); - this->biasTensors[0].resize(tensor1d(this->dt, this->numChannels)); + this->biasTensors[0].resize(tensor1d(this->dt, num)); return SUCCESS; } diff --git a/inference/engine/include/cpu/layer_norm_cpu.hpp b/inference/engine/include/cpu/layer_norm_cpu.hpp index da9dbb27..fa88cc9a 100644 --- a/inference/engine/include/cpu/layer_norm_cpu.hpp +++ b/inference/engine/include/cpu/layer_norm_cpu.hpp @@ -18,13 +18,13 @@ class LayerNormCPU : public LayerNorm { public: - LayerNormCPU(DataType dt, U32 weightNum) : LayerNorm(dt, weightNum) + LayerNormCPU(DataType dt, LayerNormParamSpec p, U32 weightNum) : LayerNorm(dt, p, weightNum) {} std::shared_ptr clone() override { std::shared_ptr mem = - std::shared_ptr(new LayerNormCPU(this->dt, this->weightNum)); + std::shared_ptr(new LayerNormCPU(this->dt, this->p, this->weightNum)); *mem = *this; return mem; } @@ -32,7 +32,7 @@ class LayerNormCPU : public LayerNorm { EE infer_weight_desc() override { auto curOpWs = this->get_weightspec(); - DataType dtNoQ = (DT_F16_8Q == this->dt) ? DT_F16 : this->dt; + DataType dtNoQ = (dt == DT_F16_8Q) ? DT_F16 : ((dt == DT_F32_8Q) ? DT_F32 : dt); if (0 != curOpWs.bytes_of_weight) { this->weightNum = curOpWs.bytes_of_weight / bytesOf(curOpWs.mdt); } @@ -67,14 +67,14 @@ class LayerNormCPU : public LayerNorm { Tensor biasTensor = this->biasTensors[0]; Tensor outputTensor = this->outputTensors[0]; - CHECK_STATUS(layer_normalization( - inputTensor, weightTensor, biasTensor, this->temp, outputTensor, &this->archInfo)); + CHECK_STATUS(layer_normalization(inputTensor, this->p, weightTensor, biasTensor, this->temp, + outputTensor, &this->archInfo)); } EE infer_output_tensors_size( std::vector inTensors, std::vector outTensors) override { - CHECK_STATUS(normalization_infer_output_size(inTensors[0], outTensors[0], &this->archInfo)); + EE ret = normalization_infer_output_size(inTensors[0], outTensors[0], &this->archInfo); #ifdef _USE_INT8 if (DT_F16_8Q == this->dt || DT_F32_8Q == this->dt) { if (featureScale.size() > 0 && -1 == (featureScale.back())[0]) { @@ -88,7 +88,7 @@ class LayerNormCPU : public LayerNorm { } } #endif - return SUCCESS; + return ret; } }; diff --git a/inference/engine/include/cpu/logsoftmax_cpu.hpp b/inference/engine/include/cpu/logsoftmax_cpu.hpp index 20d0c1a1..9be1a737 100644 --- a/inference/engine/include/cpu/logsoftmax_cpu.hpp +++ b/inference/engine/include/cpu/logsoftmax_cpu.hpp @@ -16,18 +16,10 @@ #include "cpu/softmax_cpu.hpp" -// LOGSOFTMAX_CPU_V1: y = log(softmax(x)) -// LOGSOFTMAX_CPU_V2: y = (x - reduce_max) - log(reduce_sum(exp(x - reduce_max))) class LogSoftmaxCPU : public SoftmaxCPU { public: LogSoftmaxCPU(DataType dt, SoftmaxParamSpec p) : SoftmaxCPU(dt, p) - { -#ifndef LOGSOFTMAX_CPU_V1 - TensorDesc maskDesc; - maskDesc.nDims = 0; - reductionMask.resize(maskDesc); -#endif - } + {} OperatorType get_type() override { @@ -44,79 +36,8 @@ class LogSoftmaxCPU : public SoftmaxCPU { void run() override { -#ifdef LOGSOFTMAX_CPU_V1 - ActivationParamSpec activationDesc; - activationDesc.mode = ACTIVATION_LOG; - CHECK_STATUS( - softmax(inputTensors[0], this->p, this->temp, outputTensors[0], &this->archInfo)); - CHECK_STATUS( - activation(outputTensors[0], activationDesc, outputTensors[0], &this->archInfo)); -#else - Tensor tmp, newInput; - U8 *data = (U8 *)((CpuMemory *)(this->temp.get_memory()))->get_ptr(); - std::shared_ptr p1(data, [](U8 *ptr) 
{}); - newInput.resize(inputTensors[0].get_desc()); - ((CpuMemory *)(reductionResult.get_memory()))->set_shared_ptr(p1); - std::shared_ptr p2(data + reductionResult.bytes(), [](U8 *ptr) {}); - ((CpuMemory *)(newInput.get_memory()))->set_shared_ptr(p2); - std::shared_ptr p3(data + reductionResult.bytes() + newInput.bytes(), [](U8 *ptr) {}); - ((CpuMemory *)(tmp.get_memory()))->set_shared_ptr(p3); - - ReductionParamSpec reductionSpec = get_reduction_param(); - reductionSpec.reduction_mode = REDUCTION_MAX; - CHECK_STATUS(reduction( - inputTensors[0], reductionMask, reductionSpec, tmp, reductionResult, &this->archInfo)); - EltwiseParamSpec eltwiseSpec; - eltwiseSpec.elt_mode = ELTWISE_SUB; - eltwiseSpec.activation_type = ACTIVATION_NULL; - std::vector tmpInput = {inputTensors[0], reductionResult}; - CHECK_STATUS(eltwise(tmpInput, eltwiseSpec, tmp, newInput, &this->archInfo)); - - ActivationParamSpec activationSpec; - activationSpec.mode = ACTIVATION_EXP; - CHECK_STATUS(activation(newInput, activationSpec, outputTensors[0], &this->archInfo)); - - CHECK_STATUS(reduction(outputTensors[0], reductionMask, get_reduction_param(), tmp, - reductionResult, &this->archInfo)); - - activationSpec.mode = ACTIVATION_LOG; - CHECK_STATUS(activation(reductionResult, activationSpec, reductionResult, &this->archInfo)); - - tmpInput = {newInput, reductionResult}; - CHECK_STATUS(eltwise(tmpInput, eltwiseSpec, tmp, outputTensors[0], &this->archInfo)); -#endif - } - -#ifndef LOGSOFTMAX_CPU_V1 - ReductionParamSpec get_reduction_param() - { - ReductionParamSpec reductionSpec; - reductionSpec.axes_num = 1; - reductionSpec.axes[0] = this->p.axis; - reductionSpec.reduction_mode = REDUCTION_SUM; - reductionSpec.keep_dim = true; - reductionSpec.coeff = 1; - return reductionSpec; - } - - U32 infer_tmp_memory_size() override - { - U32 bytes1 = 0, bytes2 = 0; - CHECK_STATUS(reduction_infer_output_size(&(inputTensors[0]), reductionMask, - get_reduction_param(), &reductionResult, &this->archInfo)); - - CHECK_STATUS(reduction_infer_forward_tmp_bytes( - inputTensors[0], get_reduction_param(), reductionResult, &bytes1, &this->archInfo)); - - std::vector tmpInput = {inputTensors[0], reductionResult}; CHECK_STATUS( - eltwise_infer_forward_tmp_bytes(tmpInput, inputTensors[0], &bytes2, &this->archInfo)); - return inputTensors[0].bytes() + reductionResult.bytes() + UNI_MAX(bytes1, bytes2); + logsoftmax(inputTensors[0], this->p, this->temp, outputTensors[0], &this->archInfo)); } - -private: - Tensor reductionResult; - Tensor reductionMask; -#endif }; #endif // LOGSOFTMAX_CPU_H diff --git a/inference/engine/include/cpu/matmul_cpu.hpp b/inference/engine/include/cpu/matmul_cpu.hpp index b99c01ed..f1342cc7 100644 --- a/inference/engine/include/cpu/matmul_cpu.hpp +++ b/inference/engine/include/cpu/matmul_cpu.hpp @@ -32,16 +32,17 @@ class MatMulCPU : public MatMul { void run() override { Tensor inputTensorA = this->inputTensors[0]; - TensorDesc inputDescA = inputTensorA.get_desc(); Tensor inputTensorB = this->inputTensors[1]; - TensorDesc inputDescB = inputTensorB.get_desc(); Tensor inputTensorC; if (this->inputTensors.size() > 2) { inputTensorC = this->inputTensors[2]; } Tensor outputTensor = this->outputTensors[0]; - TensorDesc outputDesc = outputTensor.get_desc(); +#ifdef _USE_INT8 + TensorDesc inputDescA = inputTensorA.get_desc(); + TensorDesc inputDescB = inputTensorB.get_desc(); + TensorDesc outputDesc = outputTensor.get_desc(); if (3 == featureScale.size() && featureScale[0][0] > 0 && DT_I8 != inputDescA.dt && DT_U8_Q != inputDescA.dt) 
{ inputTensorA.set_scale(featureScale[0][0]); @@ -53,6 +54,7 @@ class MatMulCPU : public MatMul { if (featureScale.size() > 0) { outputTensor.set_scale((featureScale.back())[0]); } +#endif std::vector tmpTensor(1, this->temp); CHECK_STATUS(matmul(inputTensors[0], this->p.transpose_a, inputTensors[1], this->p.transpose_b, inputTensorC, tmpTensor, outputTensors[0], &this->archInfo)); diff --git a/inference/engine/include/cpu/non_max_suppression_cpu.hpp b/inference/engine/include/cpu/non_max_suppression_cpu.hpp new file mode 100644 index 00000000..7ce02edb --- /dev/null +++ b/inference/engine/include/cpu/non_max_suppression_cpu.hpp @@ -0,0 +1,45 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _NON_MAX_SUPPRESSION_CPU_H +#define _NON_MAX_SUPPRESSION_CPU_H + +#include "non_max_suppression.hpp" + +class NonMaxSuppressionCPU : public NonMaxSuppression { +public: + NonMaxSuppressionCPU(DataType dt, NonMaxSuppressionParamSpec p) : NonMaxSuppression(dt, p) + {} + + std::shared_ptr clone() override + { + std::shared_ptr mem = + std::shared_ptr(new NonMaxSuppressionCPU(this->dt, this->p)); + *mem = *this; + return mem; + } + + void run() override + { + CHECK_STATUS(non_max_suppression(inputTensors, this->p, outputTensors[0], &this->archInfo)); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + return non_max_suppression_infer_output_size( + inTensors, this->p, outTensors[0], &this->archInfo); + } +}; + +#endif // NON_MAX_SUPPRESSION_CPU_H diff --git a/inference/engine/include/cpu/non_zero_cpu.hpp b/inference/engine/include/cpu/non_zero_cpu.hpp new file mode 100644 index 00000000..ad18446b --- /dev/null +++ b/inference/engine/include/cpu/non_zero_cpu.hpp @@ -0,0 +1,47 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _NON_ZERO_CPU_H +#define _NON_ZERO_CPU_H + +#include "non_zero.hpp" + +class NonZeroCPU : public NonZero { +public: + NonZeroCPU(DataType dt) : NonZero(dt) + {} + + std::shared_ptr clone() override + { + std::shared_ptr mem = std::shared_ptr(new NonZeroCPU(this->dt)); + *mem = *this; + return mem; + } + + void run() override + { + CHECK_STATUS(non_zero(inputTensors[0], outputTensors[0], &this->archInfo)); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + TensorDesc inDesc = inTensors[0]->get_desc(); + int num = tensorNumElements(inDesc); + TensorDesc outDesc = tensor2df(DT_I32, DF_NORMAL, inDesc.nDims, num); + outTensors[0]->resize(outDesc); + return SUCCESS; + } +}; + +#endif // NON_ZERO_CPU_H diff --git a/inference/engine/include/cpu/onehot_cpu.hpp b/inference/engine/include/cpu/onehot_cpu.hpp new file mode 100644 index 00000000..faf90465 --- /dev/null +++ b/inference/engine/include/cpu/onehot_cpu.hpp @@ -0,0 +1,46 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
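NonZeroCPU above cannot know how many non-zero elements exist at shape-inference time, so it reserves the worst case: a DT_I32 tensor of shape [nDims, numElements], one coordinate row per input dimension. A minimal sketch of that sizing rule, assuming the same helpers used in the patch (the free function name is only for illustration):

// Worst-case output sizing mirroring NonZeroCPU::infer_output_tensors_size.
// For a 2x3 input, every element could be non-zero, so the reserved output
// descriptor is DT_I32 with shape [2, 6]; the real count is only known in run().
TensorDesc nonZeroWorstCaseDesc(const TensorDesc &inDesc)
{
    int num = tensorNumElements(inDesc);                      // 6 for a 2x3 input
    return tensor2df(DT_I32, DF_NORMAL, inDesc.nDims, num);   // [2, 6]
}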
+ +#ifndef _ONEHOT_CPU_H +#define _ONEHOT_CPU_H + +#include "onehot.hpp" + +class OneHotCPU : public OneHot { +public: + OneHotCPU(DataType dt, OneHotParamSpec p) : OneHot(dt, p) + {} + + std::shared_ptr clone() override + { + std::shared_ptr mem = + std::shared_ptr(new OneHotCPU(this->dt, this->p)); + *mem = *this; + return mem; + } + + void run() override + { + CHECK_STATUS(onehot(inputTensors[0], this->p, outputTensors[0], &this->archInfo)); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + CHECK_STATUS(onehot_infer_output_size( + inTensors[0], this->p, this->dt, outTensors[0], &this->archInfo)); + return SUCCESS; + } +}; + +#endif // ONEHOT_CPU_H diff --git a/inference/engine/include/cpu/power_cpu.hpp b/inference/engine/include/cpu/power_cpu.hpp index e997faaf..fffc074a 100644 --- a/inference/engine/include/cpu/power_cpu.hpp +++ b/inference/engine/include/cpu/power_cpu.hpp @@ -42,7 +42,7 @@ class PowerCPU : public Power { auto inPtr = ((CpuMemory *)(inputTensor.get_memory()))->get_ptr(); auto outPtr = ((CpuMemory *)(outputTensor.get_memory()))->get_ptr(); if (inPtr != outPtr) { - memcpy(outPtr, inPtr, tensorNumBytes(inputDesc)); + UNI_MEMCPY(outPtr, inPtr, tensorNumBytes(inputDesc)); } #endif } else { @@ -53,7 +53,8 @@ class PowerCPU : public Power { EE infer_output_tensors_size( std::vector inTensors, std::vector outTensors) override { - return power_infer_output_size(inTensors[0], outTensors[0], &this->archInfo); + CHECK_STATUS(power_infer_output_size(inTensors[0], this->p, outTensors[0], &this->archInfo)); + return SUCCESS; } }; diff --git a/inference/engine/include/cpu/preallocated_memory_cpu.hpp b/inference/engine/include/cpu/preallocated_memory_cpu.hpp index 92a0fab5..a54338ad 100644 --- a/inference/engine/include/cpu/preallocated_memory_cpu.hpp +++ b/inference/engine/include/cpu/preallocated_memory_cpu.hpp @@ -18,30 +18,27 @@ class PreAllocatedMemoryCPU : public PreAllocatedMemory { public: - PreAllocatedMemoryCPU(DataType dt, TensorDesc desc) : PreAllocatedMemory(dt, desc) + PreAllocatedMemoryCPU(PreAllocatedMemoryParamSpec p) : PreAllocatedMemory(p) {} std::shared_ptr clone() override { std::shared_ptr mem = - std::shared_ptr(new PreAllocatedMemoryCPU(this->dt, this->desc)); + std::shared_ptr(new PreAllocatedMemoryCPU(this->p)); *mem = *this; return mem; } void run() override { - CHECK_STATUS(preallocated_memory(this->outputTensors[0], &this->archInfo)); + CHECK_STATUS(preallocated_memory(this->p, this->outputTensors[0], &this->archInfo)); } EE infer_output_tensors_size( std::vector inTensors, std::vector outTensors) override { - if (inTensors.size() > 0) { - CHECK_STATUS(NOT_MATCH); - } - outTensors[0]->resize(this->desc); - return SUCCESS; + return preallocated_memory_infer_output_size( + inTensors, this->p, outTensors[0], &this->archInfo); } }; diff --git a/inference/engine/include/cpu/prelu_cpu.hpp b/inference/engine/include/cpu/prelu_cpu.hpp index 5f81a573..3e9dcdc8 100644 --- a/inference/engine/include/cpu/prelu_cpu.hpp +++ b/inference/engine/include/cpu/prelu_cpu.hpp @@ -31,28 +31,35 @@ class PReLUCPU : public PReLU { EE infer_weight_desc() override { auto curOpWs = this->get_weightspec(); - U32 weightNum = 0; - if (curOpWs.weight != nullptr) { - weightNum = curOpWs.bytes_of_weight / UNI_MAX(1, bytesOf(curOpWs.mdt)); + U32 weightNum = (curOpWs.weight == nullptr) + ? 
0 + : curOpWs.bytes_of_weight / UNI_MAX(1, bytesOf(curOpWs.mdt)); + if (weightNum > 0) { + Tensor weightTensor; + weightTensor.resize(tensor1d(this->dt, weightNum)); + this->weightTensors.push_back(weightTensor); } - if (weightNum == 0) { - CHECK_STATUS(NOT_SUPPORTED); - } - if (weightNum == 1) { - this->preluDesc.propagate_down = true; - } else { - this->preluDesc.propagate_down = false; - } - Tensor weightTensor; - weightTensor.resize(tensor1d(this->dt, weightNum)); - this->weightTensors.push_back(weightTensor); return SUCCESS; } void run() override { - CHECK_STATUS(prelu(this->inputTensors[0], this->weightTensors[0], this->preluDesc, - this->outputTensors[0], &this->archInfo)); + Tensor weight; + if (this->weightTensors.size() > 0) { + weight = this->weightTensors[0]; + } else if (this->inputTensors.size() > 1) { + weight = this->inputTensors[1]; + } else { + UNI_ERROR_LOG("operator:%s type:%s doesn't have weight.\n", this->name.c_str(), + OperatorTypeName()[this->get_type()]); + } + if (weight.length() == 1) { + this->p.propagate_down = true; + } else { + this->p.propagate_down = false; + } + CHECK_STATUS( + prelu(this->inputTensors[0], weight, this->p, this->outputTensors[0], &this->archInfo)); } EE infer_output_tensors_size( diff --git a/inference/engine/include/cpu/quantizelinear_cpu.hpp b/inference/engine/include/cpu/quantizelinear_cpu.hpp new file mode 100644 index 00000000..3bd8125b --- /dev/null +++ b/inference/engine/include/cpu/quantizelinear_cpu.hpp @@ -0,0 +1,72 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
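+// CPU QuantizeLinear wrapper: when input and output already share a data type the
+// tensor is copied through unchanged; otherwise the data is quantized with the
+// calibrated featureScale entry if one exists, or with a dynamically derived scale
+// (scale = -1 requests per-layer quantization).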
+ +#ifndef _QUANTIZELINEAR_CPU_H +#define _QUANTIZELINEAR_CPU_H + +#include "quantizelinear.hpp" + +class QuantizeLinearCPU : public QuantizeLinear { +public: + QuantizeLinearCPU(DataType dt, QuantizeLinearParamSpec p) : QuantizeLinear(dt, p) + {} + + std::shared_ptr clone() override + { + std::shared_ptr mem = + std::shared_ptr(new QuantizeLinearCPU(this->dt, this->p)); + *mem = *this; + return mem; + } + + void run() override + { + F32 scale = -1; // default per layer + TensorDesc inputDesc = this->inputTensors[0].get_desc(); + TensorDesc outputDesc = this->outputTensors[0].get_desc(); + if (inputDesc.dt == outputDesc.dt) { + UNI_MEMCPY(get_ptr_from_tensor(this->outputTensors[0], this->archInfo.arch), + get_ptr_from_tensor(this->inputTensors[0], this->archInfo.arch), + tensorNumBytes(this->inputTensors[0].get_desc())); + return; + } + if (featureScale.size() > 0 && featureScale[0].size() > 0 && featureScale[0][0] > 0) { + scale = featureScale[0][0]; + } + CHECK_STATUS( + quantize(this->inputTensors[0], &this->outputTensors[0], &scale, &this->archInfo)); + this->outputTensors[0].set_scale(scale); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + TensorDesc outputDesc = inTensors[0]->get_desc(); + if (this->dt == DT_F32_8Q || this->dt == DT_F16_8Q) { +#ifdef _USE_X86 + outputDesc.dt = p.dt; + + // special case, matmul mvm + if (outputDesc.nDims >= 2 && outputDesc.dims[1] != 1) { + outputDesc.dt = DT_U8_Q; + } +#else + outputDesc.dt = DT_I8; +#endif + } + outTensors[0]->resize(outputDesc); + return SUCCESS; + } +}; + +#endif // _QUANTIZELINEAR_CPU_H diff --git a/inference/engine/include/cpu/range_cpu.hpp b/inference/engine/include/cpu/range_cpu.hpp new file mode 100644 index 00000000..af4bd19f --- /dev/null +++ b/inference/engine/include/cpu/range_cpu.hpp @@ -0,0 +1,73 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
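+// CPU Range wrapper: fills the last output with the arithmetic sequence
+// start + i * delta for i in [0, (limit - start) / delta); when a second output
+// exists, the first output receives the element count.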
+
+#ifndef _RANGE_CPU_H
+#define _RANGE_CPU_H
+
+#include "range.hpp"
+
+class RangeCPU : public Range {
+public:
+    RangeCPU(DataType dt, RangeParamSpec p) : Range(dt, p)
+    {}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<Operator> mem = std::shared_ptr<Operator>(new RangeCPU(this->dt, this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    void run() override
+    {
+        int idx = outputTensors.size() - 1;
+        TensorDesc desc = outputTensors[idx].get_desc();
+        I32 length = (p.limit - p.start) / p.delta;
+        switch (desc.dt) {
+            case DT_I32: {
+                I32 *ptr = (I32 *)((CpuMemory *)(outputTensors[idx].get_memory()))->get_ptr();
+                for (int i = 0; i < length; i++) {
+                    ptr[i] = p.start + p.delta * i;
+                }
+                break;
+            }
+            default:
+                UNI_ERROR_LOG("Range currently does not support data type %d.\n", desc.dt);
+                break;
+        }
+        if (outputTensors.size() > 1) {
+            U32 *ptr = (U32 *)((CpuMemory *)(outputTensors[0].get_memory()))->get_ptr();
+            *ptr = length;
+        }
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        U32 length = (p.limit - p.start) / p.delta;
+        TensorDesc desc0 = tensor1d(DT_U32, length);
+        desc0.df = DF_SCALAR;
+        desc0.dims[1] = length;
+        TensorDesc desc1 = tensor1d(p.dt, length);
+        if (outTensors.size() >= 1) {
+            outTensors[outTensors.size() - 1]->resize(desc1);
+        }
+        if (outTensors.size() == 2) {
+            outTensors[outTensors.size() - 2]->resize(desc0);
+        }
+        return SUCCESS;
+    }
+};
+
+#endif  // _RANGE_CPU_H
diff --git a/inference/engine/include/cpu/repeat_cpu.hpp b/inference/engine/include/cpu/repeat_cpu.hpp
index 10ec2b0e..8a7becb6 100644
--- a/inference/engine/include/cpu/repeat_cpu.hpp
+++ b/inference/engine/include/cpu/repeat_cpu.hpp
@@ -39,7 +39,7 @@ class RepeatCPU : public Repeat {
         if (this->inputTensors.size() > 1) {
             Tensor inputTensor = this->inputTensors[1];
             TensorDesc inputDesc = inputTensor.get_desc();
-            I32 *ptr = (I32 *)(((CpuMemory *)(inputTensor.get_memory()))->get_ptr());
+            U8 *ptr = (U8 *)(((CpuMemory *)(inputTensor.get_memory()))->get_ptr());
             U32 length = tensorNumElements(inputDesc);
             for (U32 i = 0; i < length; i++) {
                 // end loop
diff --git a/inference/engine/include/cpu/reshape_cpu.hpp b/inference/engine/include/cpu/reshape_cpu.hpp
index eec8571b..1df80a3d 100644
--- a/inference/engine/include/cpu/reshape_cpu.hpp
+++ b/inference/engine/include/cpu/reshape_cpu.hpp
@@ -29,6 +29,18 @@ class ReshapeCPU : public Reshape {
         return mem;
     }
 
+    ReshapeParamSpec get_param(TensorDesc desc)
+    {
+        ReshapeParamSpec ps = this->p;
+        if (ps.num_shape == 0) {
+            ps.num_shape = desc.dims[0];
+            for (int i = 0; i < ps.num_shape; i++) {
+                ps.shape[i] = desc.dims[desc.nDims + i];
+            }
+        }
+        return ps;
+    }
+
     void run() override
     {
         Tensor inputTensor = this->inputTensors[0];
@@ -37,9 +49,9 @@ class ReshapeCPU : public Reshape {
         Tensor tmpOutputTensor = outputTensor;
         auto inputDesc = inputTensor.get_desc();
         auto outputDesc = outputTensor.get_desc();
+        auto tmpOutputDesc = outputDesc;
         // if axis is 8, the mode of a model for reshape is tflite.
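+        // In that case run() writes into a temporary NHWC buffer first and
+        // transforms the result back to NCHW after the reshape.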
if (this->p.axis == 8 && outputDesc.nDims == 4) { - auto tmpOutputDesc = outputTensor.get_desc(); tmpOutputDesc.df = DF_NHWC; tmpOutputTensor = this->temp; tmpOutputTensor.resize(tmpOutputDesc); @@ -61,7 +73,6 @@ class ReshapeCPU : public Reshape { // NHWC -> NCHW if (this->p.axis == 8 && outputDesc.nDims == 4) { auto outputDesc = outputTensor.get_desc(); - auto tmpOutputDesc = tmpOutputTensor.get_desc(); void *tmpOutputPtr = ((CpuMemory *)(tmpOutputTensor.get_memory()))->get_ptr(); transformToNCHW(tmpOutputDesc, tmpOutputPtr, outputDesc, ((CpuMemory *)(outputTensor.get_memory()))->get_ptr()); @@ -72,9 +83,11 @@ class ReshapeCPU : public Reshape { EE infer_output_tensors_size( std::vector inTensors, std::vector outTensors) override { - CHECK_STATUS( - reshape_infer_output_size(inTensors[0], this->p, outTensors[0], &this->archInfo)); - return SUCCESS; + ReshapeParamSpec ps = this->p; + if (ps.num_shape == 0 && inTensors.size() > 1) { + ps = get_param(inTensors[1]->get_desc()); + } + return reshape_infer_output_size(inTensors[0], ps, outTensors[0], &this->archInfo); } U32 infer_tmp_memory_size() override @@ -82,6 +95,9 @@ class ReshapeCPU : public Reshape { U32 bytes = 0; CHECK_STATUS(reshape_infer_forward_tmp_bytes( this->inputTensors[0], this->outputTensors[0], &bytes, &this->archInfo)); + if (this->p.axis == 8) { + bytes += UNI_MAX(this->inputTensors[0].bytes(), this->outputTensors[0].bytes()); + } return bytes; } }; diff --git a/inference/engine/include/cpu/resize_cpu.hpp b/inference/engine/include/cpu/resize_cpu.hpp index cee728bc..93167c32 100644 --- a/inference/engine/include/cpu/resize_cpu.hpp +++ b/inference/engine/include/cpu/resize_cpu.hpp @@ -19,58 +19,35 @@ class ResizeCPU : public Resize { public: - ResizeCPU(DataType paramDT, ResizeParamSpec p) : Resize(paramDT, p) + ResizeCPU(DataType dt, ResizeParamSpec p) : Resize(dt, p) {} std::shared_ptr clone() override { std::shared_ptr mem = - std::shared_ptr(new ResizeCPU(this->paramDT, this->p)); + std::shared_ptr(new ResizeCPU(this->dt, this->p)); *mem = *this; return mem; } void run() override { - CHECK_STATUS(resize(inputTensors[0], temp, outputTensors[0], this->p, &this->archInfo)); + CHECK_STATUS(resize(inputTensors[0], this->p, temp, outputTensors[0], &this->archInfo)); } EE infer_output_tensors_size( std::vector inTensors, std::vector outTensors) override { - U32 bytes; - switch (paramDT) { - case DT_F32: { - CHECK_REQUIREMENT(1 == this->p.scales[0] && 1 == this->p.scales[1]); - CHECK_STATUS(resize_infer_output_size(inTensors[0], this->paramDT, - this->p.scales + 2, outTensors[0], &bytes, &this->archInfo)); - break; - } - case DT_U32: { - CHECK_STATUS(resize_infer_output_size(inTensors[0], this->paramDT, this->p.sizes, - outTensors[0], &bytes, &this->archInfo)); - break; - } - default: { - CHECK_STATUS(NOT_SUPPORTED); - } - } - return SUCCESS; + return resize_infer_output_size(inTensors[0], this->p, outTensors[0], &this->archInfo); } U32 infer_tmp_memory_size() override { - U32 size = 0; - TensorDesc inputDesc = inputTensors[0].get_desc(); - if (DF_NCHW == inputDesc.df && (IS_ARM(archInfo.arch) || IS_X86(archInfo.arch))) { - U32 paddedC = (inputDesc.dims[2] + 7) / 8 * 8; - TensorDesc outputDesc = outputTensors[0].get_desc(); - inputDesc.dims[2] = paddedC; - outputDesc.dims[2] = paddedC; - size = tensorNumBytes(inputDesc) + tensorNumBytes(outputDesc); - } - return size; + U32 bytes = 0; + CHECK_STATUS(resize_infer_forward_tmp_bytes( + this->inputTensors[0], this->p, this->outputTensors[0], &bytes, &this->archInfo)); + return 
bytes; } }; -#endif // _RESIZECPU_H +#endif // _RESIZE_CPU_H diff --git a/inference/engine/include/cpu/rnn_cpu.hpp b/inference/engine/include/cpu/rnn_cpu.hpp index e0691525..70ab5eab 100644 --- a/inference/engine/include/cpu/rnn_cpu.hpp +++ b/inference/engine/include/cpu/rnn_cpu.hpp @@ -36,17 +36,17 @@ class RNNCPU : public RNNCellCPU { U8 *state = (U8 *)get_ptr_from_tensor(this->temp, this->archInfo.arch); TensorDesc desc = inputTensor.get_desc(); int batch = desc.dims[desc.nDims - 1]; - I32 num = p.biDirection ? 2 : 1; - I32 column = this->p.numProjection > 0 ? this->p.numProjection : this->p.numOutput; - U32 ch_size = (this->p.numOutput + column) * bytesOf(desc.dt); + I32 num = p.bi_direction ? 2 : 1; + I32 column = this->p.num_projection > 0 ? this->p.num_projection : this->p.num_outputs; + U32 ch_size = (this->p.num_outputs + column) * bytesOf(desc.dt); if (this->inputTensors.size() == 1) { // bi-direction rnn has forward-states and backward-states - memset(state, 0, batch * num * ch_size); + UNI_MEMSET(state, 0, batch * num * ch_size); } else if (this->inputTensors.size() == 2) { if (num != 1) { UNI_ERROR_LOG("currently not support to set bi-direction RNN's h or c.\n"); } - memcpy(state, get_ptr_from_tensor(this->inputTensors[1], this->archInfo.arch), + UNI_MEMCPY(state, get_ptr_from_tensor(this->inputTensors[1], this->archInfo.arch), tensorNumBytes(this->inputTensors[1].get_desc())); } else if (this->inputTensors.size() == 3) { if (num != 1) { @@ -59,8 +59,8 @@ class RNNCPU : public RNNCellCPU { U32 input_c_tile = tensorNumBytes(this->inputTensors[2].get_desc()) / batch; for (int i = 0; i < batch; i++) { U8 *ptr = state + i * ch_size; - memcpy(ptr, c + input_c_tile * i, input_c_tile); - memcpy(ptr + c_size, h + input_h_tile * i, input_h_tile); + UNI_MEMCPY(ptr, c + input_c_tile * i, input_c_tile); + UNI_MEMCPY(ptr + c_size, h + input_h_tile * i, input_h_tile); } } @@ -69,7 +69,7 @@ class RNNCPU : public RNNCellCPU { tmpTensor, this->outputTensors, &this->archInfo)); if (this->outputTensors.size() == 2) { - memcpy(get_ptr_from_tensor(this->outputTensors[1], this->archInfo.arch), state, + UNI_MEMCPY(get_ptr_from_tensor(this->outputTensors[1], this->archInfo.arch), state, tensorNumBytes(this->outputTensors[1].get_desc())); } else if (this->outputTensors.size() == 3) { U8 *h = (U8 *)get_ptr_from_tensor(this->outputTensors[1], this->archInfo.arch); @@ -79,8 +79,8 @@ class RNNCPU : public RNNCellCPU { U32 output_c_tile = tensorNumBytes(this->outputTensors[2].get_desc()) / batch; for (int i = 0; i < batch; i++) { U8 *ptr = state + i * ch_size; - memcpy(c + output_c_tile * i, ptr, output_c_tile); - memcpy(h + output_h_tile * i, ptr + c_size, output_h_tile); + UNI_MEMCPY(c + output_c_tile * i, ptr, output_c_tile); + UNI_MEMCPY(h + output_h_tile * i, ptr + c_size, output_h_tile); } } } @@ -89,16 +89,12 @@ class RNNCPU : public RNNCellCPU { std::vector inTensors, std::vector outTensors) override { TensorDesc inputDesc = inTensors[0]->get_desc(); - - if (inputDesc.nDims < 3) { - CHECK_STATUS(NOT_MATCH); - } + CHECK_REQUIREMENT(inputDesc.nDims >= 3); this->xDim = inputDesc.dims[inputDesc.nDims - 3]; for (U32 i = 0; i < inputDesc.nDims - 3; ++i) { xDim *= inputDesc.dims[i]; } - CHECK_STATUS(rnn_infer_output_size(inTensors, this->p, outTensors, &this->archInfo)); - return SUCCESS; + return rnn_infer_output_size(inTensors, this->p, outTensors, &this->archInfo); } U32 infer_tmp_memory_size() override diff --git a/inference/engine/include/cpu/rnncell_cpu.hpp 
b/inference/engine/include/cpu/rnncell_cpu.hpp index fe2b13f7..d595876f 100644 --- a/inference/engine/include/cpu/rnncell_cpu.hpp +++ b/inference/engine/include/cpu/rnncell_cpu.hpp @@ -43,7 +43,7 @@ class RNNCellCPU : public RNNCell { tmpOffset = xTensor.bytes(); } CHECK_STATUS(rnncell(xTensor, this->weightTensors, this->biasTensors, stateTensor, this->p, - this->xDim, this->p.numOutput, tmpOffset, tmpTensor, hTensor, &this->archInfo)); + this->xDim, this->p.num_outputs, tmpOffset, tmpTensor, hTensor, &this->archInfo)); } EE infer_output_tensors_size( @@ -96,14 +96,14 @@ class RNNCellCPU : public RNNCell { EE infer_weight_desc() override { - int directions = (this->p.biDirection) ? 2 : 1; + int directions = (this->p.bi_direction) ? 2 : 1; int weightNum, biasNum, column; - if (this->p.numProjection > 0) { + if (this->p.num_projection > 0) { weightNum = biasNum = 2; - column = this->p.numProjection; + column = this->p.num_projection; } else { weightNum = biasNum = 1; - column = this->p.numOutput; + column = this->p.num_outputs; } int gates = 0; switch (this->p.mode) { @@ -121,12 +121,12 @@ class RNNCellCPU : public RNNCell { return NOT_SUPPORTED; } U32 filterRow = gates * column; - U32 filterCol = this->xDim + this->p.numOutput; + U32 filterCol = this->xDim + this->p.num_outputs; std::vector weight_desc(2), bias_desc(2); weight_desc[0] = tensor2df(this->dt, DF_NK, filterRow, filterCol); - weight_desc[1] = tensor2df(this->dt, DF_NK, this->p.numOutput, this->p.numProjection); + weight_desc[1] = tensor2df(this->dt, DF_NK, this->p.num_outputs, this->p.num_projection); bias_desc[0] = tensor1d(this->dt, filterRow); - bias_desc[1] = tensor1d(this->dt, this->p.numOutput); + bias_desc[1] = tensor1d(this->dt, this->p.num_outputs); this->weightTensors = std::vector(directions * weightNum); this->biasTensors = std::vector(directions * biasNum); for (int i = 0, wid = 0, vid = 0; i < directions; i++) { diff --git a/inference/engine/include/cpu/roialign_cpu.hpp b/inference/engine/include/cpu/roialign_cpu.hpp new file mode 100644 index 00000000..c142ede1 --- /dev/null +++ b/inference/engine/include/cpu/roialign_cpu.hpp @@ -0,0 +1,53 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
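+// CPU RoIAlign wrapper: run() forwards the input tensors to the roialign kernel
+// together with a scratch buffer sized by roialign_infer_forward_tmp_bytes.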
+
+#ifndef _ROIALIGN_CPU_H
+#define _ROIALIGN_CPU_H
+
+#include "roialign.hpp"
+
+class RoIAlignCPU : public RoIAlign {
+public:
+    RoIAlignCPU(DataType dt, RoIAlignParamSpec p) : RoIAlign(dt, p)
+    {}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<Operator> mem =
+            std::shared_ptr<Operator>(new RoIAlignCPU(this->dt, this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    void run() override
+    {
+        CHECK_STATUS(roialign(inputTensors, this->p, this->temp, outputTensors[0], &this->archInfo));
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        CHECK_STATUS(roialign_infer_output_size(inTensors, this->p, outTensors[0], &this->archInfo));
+        return SUCCESS;
+    }
+
+    U32 infer_tmp_memory_size() override
+    {
+        U32 bytes = 0;
+        CHECK_STATUS(roialign_infer_forward_tmp_bytes(
+            this->inputTensors[0], this->outputTensors[0], &bytes, &this->archInfo));
+        return bytes;
+    }
+};
+
+#endif  // _ROIALIGN_CPU_H
diff --git a/inference/engine/include/cpu/scale_cpu.hpp b/inference/engine/include/cpu/scale_cpu.hpp
index 669c11ae..d2a54722 100644
--- a/inference/engine/include/cpu/scale_cpu.hpp
+++ b/inference/engine/include/cpu/scale_cpu.hpp
@@ -24,7 +24,7 @@ class ScaleCPU : public Scale {
     std::shared_ptr<Operator> clone() override
     {
         std::shared_ptr<Operator> mem =
-            std::shared_ptr<Operator>(new ScaleCPU(this->dt, this->p, this->numChannels));
+            std::shared_ptr<Operator>(new ScaleCPU(this->dt, this->p, 0));
         *mem = *this;
         return mem;
     }
diff --git a/inference/engine/include/cpu/shape_cpu.hpp b/inference/engine/include/cpu/shape_cpu.hpp
index 10df2bc7..a111f749 100644
--- a/inference/engine/include/cpu/shape_cpu.hpp
+++ b/inference/engine/include/cpu/shape_cpu.hpp
@@ -33,15 +33,23 @@ class ShapeCPU : public Shape {
         Tensor inputTensor = this->inputTensors[0];
         TensorDesc inputDesc = inputTensor.get_desc();
         Tensor outputTensor = this->outputTensors[0];
-        UNI_MEMCPY(((CpuMemory *)(outputTensor.get_memory()))->get_ptr(), inputDesc.dims,
-            inputDesc.nDims * sizeof(U32));
+        U32 *ptr = (U32 *)((CpuMemory *)(outputTensor.get_memory()))->get_ptr();
+        for (U32 i = 0; i < inputDesc.nDims; i++) {
+            ptr[i] = inputDesc.dims[inputDesc.nDims - 1 - i];
+        }
     }
 
     EE infer_output_tensors_size(
         std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
     {
         TensorDesc inputDesc = inTensors[0]->get_desc();
-        TensorDesc outputDesc = tensor1d(DT_U32, inputDesc.nDims);
+        TensorDesc outputDesc;
+        outputDesc.dt = DT_U32;
+        outputDesc.nDims = 1;
+        outputDesc.dims[0] = inputDesc.nDims;
+        for (U32 i = 0; i < inputDesc.nDims; i++) {
+            outputDesc.dims[outputDesc.nDims + i] = inputDesc.dims[inputDesc.nDims - 1 - i];
+        }
         outTensors[0]->resize(outputDesc);
         return SUCCESS;
     }
diff --git a/inference/engine/include/cpu/shared_weight_cpu.hpp b/inference/engine/include/cpu/shared_weight_cpu.hpp
index eec5ec9e..f73a28b6 100644
--- a/inference/engine/include/cpu/shared_weight_cpu.hpp
+++ b/inference/engine/include/cpu/shared_weight_cpu.hpp
@@ -61,12 +61,12 @@ class SharedWeightCPU : public SharedWeight {
         U32 weightBytes = modelWeightTensor.bytes();
         modelWeightTensor.alloc();
         if (modelPtr != nullptr) {
-            memcpy(
+            UNI_MEMCPY(
                 ((CpuMemory *)(modelWeightTensor.get_memory()))->get_ptr(), modelPtr, weightBytes);
             *modelPtrShared = std::shared_ptr<U8>(*modelPtrShared, modelPtr + weightBytes);
         } else {
             auto curOpWs = this->get_weightspec();
-            memcpy(((CpuMemory *)(modelWeightTensor.get_memory()))->get_ptr(), curOpWs.weight,
+            UNI_MEMCPY(((CpuMemory *)(modelWeightTensor.get_memory()))->get_ptr(), curOpWs.weight,
                 weightBytes);
         }
         this->weightTensors.push_back(modelWeightTensor);
diff --git
a/inference/engine/include/cpu/slice_cpu.hpp b/inference/engine/include/cpu/slice_cpu.hpp index 4321b20a..2f618572 100644 --- a/inference/engine/include/cpu/slice_cpu.hpp +++ b/inference/engine/include/cpu/slice_cpu.hpp @@ -39,14 +39,7 @@ class SliceCPU : public Slice { EE infer_output_tensors_size( std::vector inTensors, std::vector outTensors) override { - CHECK_STATUS(slice_infer_output_size(inTensors[0], this->p, outTensors, &this->archInfo)); - auto outDesc = outTensors[0]->get_desc(); - if (outDesc.nDims == 3 && outDesc.dims[1] == 1 && outDesc.dims[2] == 1) { - outDesc.nDims = 2; - outDesc.df = DF_NORMAL; - outTensors[0]->resize(outDesc); - } - return SUCCESS; + return slice_infer_output_size(inTensors[0], this->p, outTensors, &this->archInfo); } }; diff --git a/inference/engine/include/cpu/softmax_cpu.hpp b/inference/engine/include/cpu/softmax_cpu.hpp index 15650c4f..bff819b5 100644 --- a/inference/engine/include/cpu/softmax_cpu.hpp +++ b/inference/engine/include/cpu/softmax_cpu.hpp @@ -38,9 +38,7 @@ class SoftmaxCPU : public Softmax { EE infer_output_tensors_size( std::vector inTensors, std::vector outTensors) override { - CHECK_STATUS( - softmax_infer_output_size(inTensors[0], this->p, outTensors[0], &this->archInfo)); - return SUCCESS; + return softmax_infer_output_size(inTensors[0], this->p, outTensors[0], &this->archInfo); } }; diff --git a/inference/engine/include/cpu/space2depth_cpu.hpp b/inference/engine/include/cpu/space2depth_cpu.hpp new file mode 100644 index 00000000..7bf54335 --- /dev/null +++ b/inference/engine/include/cpu/space2depth_cpu.hpp @@ -0,0 +1,46 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#ifndef _SPACE2DEPTH_CPU_H +#define _SPACE2DEPTH_CPU_H + +#include "space2depth.hpp" + +class Space2DepthCPU : public Space2Depth { +public: + Space2DepthCPU(DataType dt, Space2DepthParamSpec p) : Space2Depth(dt, p) + {} + + std::shared_ptr clone() override + { + std::shared_ptr mem = + std::shared_ptr(new Space2DepthCPU(this->dt, this->p)); + *mem = *this; + return mem; + } + + void run() override + { + CHECK_STATUS(space2depth(inputTensors[0], this->p, outputTensors[0], &this->archInfo)); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + CHECK_STATUS( + space2depth_infer_output_size(inTensors[0], this->p, outTensors[0], &this->archInfo)); + return SUCCESS; + } +}; + +#endif // SPACE2DEPTH_CPU_H diff --git a/inference/engine/include/cpu/splice_cpu.hpp b/inference/engine/include/cpu/splice_cpu.hpp index e05b858f..bbb87424 100644 --- a/inference/engine/include/cpu/splice_cpu.hpp +++ b/inference/engine/include/cpu/splice_cpu.hpp @@ -44,8 +44,8 @@ class SpliceCPU : public Splice { this->transform_filter(); } EmbedParamSpec embedParamSpec; - embedParamSpec.input_dim = this->inputFrameSize; - embedParamSpec.num_output = inputDesc.dims[0]; + embedParamSpec.num_inputs = this->inputFrameSize; + embedParamSpec.num_outputs = inputDesc.dims[0]; embedParamSpec.transpose = false; CHECK_STATUS(embedding(this->weightTensors[0], inputTensor, embedParamSpec, this->temp, outputTensor, &this->archInfo)); diff --git a/inference/engine/include/cpu/tdnn_convolution_cpu.hpp b/inference/engine/include/cpu/tdnn_convolution_cpu.hpp index a83167ff..196fcd2c 100644 --- a/inference/engine/include/cpu/tdnn_convolution_cpu.hpp +++ b/inference/engine/include/cpu/tdnn_convolution_cpu.hpp @@ -35,9 +35,9 @@ class TdnnConvolutionCPU : public ConvolutionCPU { UNI_ERROR_LOG("TdnnCPU currently not support time context is decreasing order\n"); } } - ConvolutionMode convMode = Convolution_Pointwise; + ConvolutionMode convMode = CONVOLUTION_POINTWISE; if (dilation > 1) { - convMode = Convolution_Dilation; + convMode = CONVOLUTION_DILATION; } this->p = createConvolutionParamSpec(1, 1, this->tdnn.num_context, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, dilation, 1, this->tdnn.num_outputs, convMode); diff --git a/inference/engine/include/cpu/tdnn_fully_connected_cpu.hpp b/inference/engine/include/cpu/tdnn_fully_connected_cpu.hpp index 85c11226..1eddb1c1 100644 --- a/inference/engine/include/cpu/tdnn_fully_connected_cpu.hpp +++ b/inference/engine/include/cpu/tdnn_fully_connected_cpu.hpp @@ -80,7 +80,7 @@ class TdnnFullyConnectedCPU : public FullyConnectedCPU { j < this->outputFrameSize - this->slide_size; j++) { U8 *dst = output + (i * this->outputFrameSize + j) * tileSize; U8 *src = dst + tileSize; - memcpy(dst, src, tileSize); + UNI_MEMCPY(dst, src, tileSize); } } } @@ -104,8 +104,8 @@ class TdnnFullyConnectedCPU : public FullyConnectedCPU { ((CpuMemory *)spliceResult.get_memory())->set_shared_ptr(spliceBuffer); EmbedParamSpec embedParamSpec; - embedParamSpec.input_dim = this->inputFrameSize; - embedParamSpec.num_output = inputDesc.dims[0]; + embedParamSpec.num_inputs = this->inputFrameSize; + embedParamSpec.num_outputs = inputDesc.dims[0]; embedParamSpec.transpose = false; CHECK_STATUS(embedding(this->index, inputTensor, embedParamSpec, this->temp, spliceResult, &this->archInfo)); diff --git a/inference/engine/include/cpu/topk_cpu.hpp b/inference/engine/include/cpu/topk_cpu.hpp index 99d0ce3e..936651bd 100644 --- a/inference/engine/include/cpu/topk_cpu.hpp +++ 
b/inference/engine/include/cpu/topk_cpu.hpp @@ -28,19 +28,28 @@ class TopKCPU : public TopK { return mem; } + TopKParamSpec get_param(TensorDesc desc) + { + TopKParamSpec lp = this->p; + if (lp.k == 0) { + lp.k = desc.dims[desc.nDims]; + } + return lp; + } void run() override { - Tensor inputTensor = this->inputTensors[0]; - Tensor outputTensor = this->outputTensors[0]; - Tensor outputIndicesTensor = this->outputTensors[1]; - CHECK_STATUS(topk( - inputTensor, this->p, this->temp, outputTensor, outputIndicesTensor, &this->archInfo)); + CHECK_STATUS(topk(inputTensors[0], this->p, this->temp, outputTensors[0], outputTensors[1], + &this->archInfo)); } EE infer_output_tensors_size( std::vector inTensors, std::vector outTensors) override { - CHECK_STATUS(topk_infer_output_size( - inTensors[0], this->p, outTensors[0], outTensors[1], &this->archInfo)); + TopKParamSpec lp = this->p; + if (lp.k == 0 && inTensors.size() > 1) { + lp = get_param(inTensors[1]->get_desc()); + } + CHECK_STATUS( + topk_infer_output_size(inTensors[0], lp, outTensors[0], outTensors[1], &this->archInfo)); return SUCCESS; } diff --git a/inference/engine/include/cpu/where_cpu.hpp b/inference/engine/include/cpu/where_cpu.hpp index 86d63c67..bba916ef 100644 --- a/inference/engine/include/cpu/where_cpu.hpp +++ b/inference/engine/include/cpu/where_cpu.hpp @@ -14,7 +14,6 @@ #ifndef _WHERE_CPU_H #define _WHERE_CPU_H -#include #include "where.hpp" class WhereCPU : public Where { @@ -31,31 +30,14 @@ class WhereCPU : public Where { void run() override { - CHECK_STATUS(where(this->inputTensors[1], this->inputTensors[0], this->biasTensors[0], + CHECK_STATUS(where(this->inputTensors[0], this->inputTensors[1], this->inputTensors[2], this->outputTensors[0], &this->archInfo)); } EE infer_output_tensors_size( std::vector inTensors, std::vector outTensors) override { - //inTensors[0] is condition now, 2021/2/3 - CHECK_STATUS(where_infer_output_size( - inTensors[inTensors.size() - 1], outTensors[0], &this->archInfo)); - return SUCCESS; - } - - EE infer_weight_desc() override - { - auto curOpWs = this->get_weightspec(); - int weightBytes = curOpWs.bytes_of_weight; - int Lw = sqrt(weightBytes / bytesOf(curOpWs.mdt)); - int biasBytes = curOpWs.bytes_of_vec; - int Lb = biasBytes / bytesOf(curOpWs.mdt); - this->weightTensors = std::vector(1); - this->weightTensors[0].resize(tensor4d(this->dt, 1, 1, Lw, Lw)); - this->biasTensors = std::vector(1); - this->biasTensors[0].resize(tensor2d(this->dt, 1, Lb)); - return SUCCESS; + return where_infer_output_size(inTensors[1], inTensors[2], outTensors[0], &this->archInfo); } }; diff --git a/inference/engine/include/cumsum.hpp b/inference/engine/include/cumsum.hpp new file mode 100644 index 00000000..dc91a99e --- /dev/null +++ b/inference/engine/include/cumsum.hpp @@ -0,0 +1,35 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _CUMSUM_H +#define _CUMSUM_H + +#include "operator.hpp" + +class CumSum : public Operator { +public: + explicit CumSum(DataType dt, CumSumParamSpec p) + { + this->dt = dt; + this->p = p; + } + + OperatorType get_type() override + { + return OT_CumSum; + } + +protected: + CumSumParamSpec p; +}; +#endif // _CUMSUM_H diff --git a/inference/engine/include/data_loader.hpp b/inference/engine/include/data_loader.hpp index bd88c9ca..7dadacee 100644 --- a/inference/engine/include/data_loader.hpp +++ b/inference/engine/include/data_loader.hpp @@ -20,6 +20,8 @@ int string_end_with(std::string s, std::string sub); +bool is_directory(std::string path); + void get_files(std::string directoryName, std::vector &files); std::vector load_fake_data(std::vector dataDesc); diff --git a/inference/engine/include/detection_output.hpp b/inference/engine/include/detection_output.hpp index cdc92799..ba52f8a6 100644 --- a/inference/engine/include/detection_output.hpp +++ b/inference/engine/include/detection_output.hpp @@ -46,9 +46,7 @@ class DetectionOutput : public Operator { EE infer_output_tensors_size( std::vector inTensors, std::vector outTensors) override { - CHECK_STATUS( - detectionoutput_infer_output_size(inTensors, this->p, outTensors[0], &this->archInfo)); - return SUCCESS; + return detectionoutput_infer_output_size(inTensors, this->p, outTensors[0], &this->archInfo); } protected: diff --git a/inference/engine/include/eltwise.hpp b/inference/engine/include/eltwise.hpp index 35a13270..85d14b9e 100644 --- a/inference/engine/include/eltwise.hpp +++ b/inference/engine/include/eltwise.hpp @@ -18,9 +18,9 @@ class Eltwise : public Operator { public: - Eltwise(EltwiseParamSpec eltwiseDesc) + Eltwise(EltwiseParamSpec p) { - this->eltwiseDesc = eltwiseDesc; + this->p = p; } OperatorType get_type() override @@ -37,6 +37,6 @@ class Eltwise : public Operator { } protected: - EltwiseParamSpec eltwiseDesc; + EltwiseParamSpec p; }; #endif // _ELTWISE_H diff --git a/inference/engine/include/equal.hpp b/inference/engine/include/equal.hpp index 3fbcd179..d1e8c243 100644 --- a/inference/engine/include/equal.hpp +++ b/inference/engine/include/equal.hpp @@ -29,11 +29,6 @@ class Equal : public WeightOperator { return OT_Equal; } - bool can_input_output_the_same() override - { - return false; - } - protected: EqualParamSpec p; }; diff --git a/inference/engine/include/factory.hpp b/inference/engine/include/factory.hpp index 085b43c9..8cc0c6ae 100644 --- a/inference/engine/include/factory.hpp +++ b/inference/engine/include/factory.hpp @@ -15,11 +15,10 @@ #define _FACTORY_H #include "operator.hpp" -#include "tensor_computing.h" #define NOT_SUPPORT \ Operator *cep = NULL; \ - CHECK_STATUS(NOT_SUPPORTED); + UNI_ERROR_LOG("not support to create operator in %s.\n", __FUNCTION__); #define NOT_USE0() #define NOT_USE1(a1) \ { \ @@ -100,11 +99,12 @@ class Factory { virtual std::shared_ptr createMatMul(DataType dt, MatMulParamSpec p) = 0; - virtual std::shared_ptr createLayerNorm(DataType dt, U32 weightNum) = 0; + virtual std::shared_ptr createLayerNorm( + 
DataType dt, LayerNormParamSpec p, U32 weightNum) = 0; virtual std::shared_ptr createReshape(DataType dt, ReshapeParamSpec p) = 0; - virtual std::shared_ptr createResize(DataType paramDT, ResizeParamSpec p) = 0; + virtual std::shared_ptr createResize(DataType dt, ResizeParamSpec p) = 0; virtual std::shared_ptr createSlice(DataType dt, SliceParamSpec p) = 0; @@ -131,7 +131,7 @@ class Factory { virtual std::shared_ptr createBilateralSliceApply(BilateralSliceApplyParamSpec p) = 0; - virtual std::shared_ptr createPreAllocatedMemory(DataType dt, TensorDesc desc) = 0; + virtual std::shared_ptr createPreAllocatedMemory(PreAllocatedMemoryParamSpec p) = 0; virtual std::shared_ptr createSharedWeight(DataType dt, TensorDesc desc, @@ -186,8 +186,6 @@ class Factory { virtual std::shared_ptr createCast(DataType dt, CastParamSpec p) = 0; - virtual std::shared_ptr createEqual(DataType dt, EqualParamSpec p) = 0; - virtual std::shared_ptr createExpand(DataType dt, ExpandParamSpec p) = 0; virtual std::shared_ptr createScatter(DataType dt, ScatterParamSpec p) = 0; @@ -198,23 +196,31 @@ class Factory { virtual std::shared_ptr createInstanceNorm(DataType dt, InstanceNormParamSpec p) = 0; - virtual std::shared_ptr createRoIAlign(RoIAlignParamSpec p) = 0; + virtual std::shared_ptr createRoIAlign(DataType dt, RoIAlignParamSpec p) = 0; virtual std::shared_ptr createGenerateProposals( DataType dt, GenerateProposalsParamSpec p) = 0; virtual std::shared_ptr createGAT(DataType dt, GATParamSpec p) = 0; - DataType get_float_precision(DataType dt) - { - DataType ret = dt; - if (dt == DT_F16_8Q) { - ret = DT_F16; - } else if (dt == DT_F32_8Q) { - ret = DT_F32; - } - return ret; - } + virtual std::shared_ptr createQuantizeLinear( + DataType dt, QuantizeLinearParamSpec p) = 0; + + virtual std::shared_ptr createGridSample(DataType dt, GridSampleParamSpec p) = 0; + + virtual std::shared_ptr createOneHot(DataType dt, OneHotParamSpec p) = 0; + + virtual std::shared_ptr createCumSum(DataType dt, CumSumParamSpec p) = 0; + + virtual std::shared_ptr createNonMaxSuppression( + DataType dt, NonMaxSuppressionParamSpec p) = 0; + + virtual std::shared_ptr createConstantOfShape( + DataType dt, ConstantOfShapeParamSpec p) = 0; + + virtual std::shared_ptr createNonZero(DataType dt) = 0; + + virtual std::shared_ptr createRange(DataType dt, RangeParamSpec p) = 0; std::shared_ptr createOperators(OperatorSpec curOps, DataType dt, @@ -239,14 +245,28 @@ class Factory { if (dt == DT_F32_8Q || dt == DT_F16_8Q) { #ifndef _USE_INT8 UNI_ERROR_LOG("this library not support to inference int8, please recompile with " - "--int8=on. Only Armv7+ and x86 AVX512-VNNI cpu support.\n"); + "--int8=on. Only Armv7+ and x86 AVX512/AVX512-VNNI cpu support.\n"); #endif } OperatorType opType = curOps.type; - DataType dtNoQ = get_float_precision(dt); + DataType dtNoQ = (dt == DT_F16_8Q) ? DT_F16 : ((dt == DT_F32_8Q) ? 
DT_F32 : dt); std::string opName = curOps.name; std::shared_ptr op; auto curPs = curOps.ps; + std::map activationMap = {{OT_Relu6, ACTIVATION_RELU6}, + {OT_HSwish, ACTIVATION_H_SWISH}, {OT_HSwishNoDiv, ACTIVATION_H_SWISH_NODIV}, + {OT_Sigmoid, ACTIVATION_SIGMOID}, {OT_HSigmoid, ACTIVATION_H_SIGMOID}, + {OT_Gelu, ACTIVATION_GELU}, {OT_TanH, ACTIVATION_TANH}, {OT_Mish, ACTIVATION_MISH}, + {OT_Greater, ACTIVATION_GREATER}, {OT_Exp, ACTIVATION_EXP}, + {OT_SoftPlus, ACTIVATION_SOFTPLUS}, {OT_Abs, ACTIVATION_ABS}, {OT_Sign, ACTIVATION_SIGN}, + {OT_Not, ACTIVATION_NOT}, {OT_Log, ACTIVATION_LOG}, {OT_Neg, ACTIVATION_NEG}, + {OT_Round, ACTIVATION_ROUND}, {OT_Floor, ACTIVATION_FLOOR}, {OT_Ceil, ACTIVATION_CEIL}, + {OT_Swish, ACTIVATION_SWISH}, {OT_Reciprocal, ACTIVATION_RECIPROCAL}}; + if (activationMap.find(opType) != activationMap.end()) { + ActivationParamSpec activationDesc; + activationDesc.mode = activationMap[opType]; + return createActivation(activationDesc); + } switch (opType) { case OT_Conv: { ActivationParamSpec dwActiveDesc; @@ -288,60 +308,6 @@ class Factory { op = createActivation(activationDesc); break; } - case OT_Relu6: { - ActivationParamSpec activationDesc; - activationDesc.mode = ACTIVATION_RELU6; - op = createActivation(activationDesc); - break; - } - case OT_HSwish: { - ActivationParamSpec activationDesc; - activationDesc.mode = ACTIVATION_H_SWISH; - op = createActivation(activationDesc); - break; - } - case OT_HSwishNoDiv: { - ActivationParamSpec activationDesc; - activationDesc.mode = ACTIVATION_H_SWISH_NODIV; - op = createActivation(activationDesc); - break; - } - case OT_Sigmoid: { - ActivationParamSpec activationDesc; - activationDesc.mode = ACTIVATION_SIGMOID; - op = createActivation(activationDesc); - break; - } - case OT_HSigmoid: { - ActivationParamSpec activationDesc; - activationDesc.mode = ACTIVATION_H_SIGMOID; - op = createActivation(activationDesc); - break; - } - case OT_Gelu: { - ActivationParamSpec activationDesc; - activationDesc.mode = ACTIVATION_GELU; - op = createActivation(activationDesc); - break; - } - case OT_TanH: { - ActivationParamSpec activationDesc; - activationDesc.mode = ACTIVATION_TANH; - op = createActivation(activationDesc); - break; - } - case OT_Mish: { - ActivationParamSpec activationDesc; - activationDesc.mode = ACTIVATION_MISH; - op = createActivation(activationDesc); - break; - } - case OT_Greater: { - ActivationParamSpec activationDesc; - activationDesc.mode = ACTIVATION_GREATER; - op = createActivation(activationDesc); - break; - } case OT_Concat: { op = createConcat(curPs.concat_spec); break; @@ -367,7 +333,7 @@ class Factory { break; } case OT_LayerNorm: { - op = createLayerNorm(dt, 0); + op = createLayerNorm(dt, curPs.ln_spec, 0); break; } case OT_Reshape: { @@ -375,12 +341,7 @@ class Factory { break; } case OT_Resize: { - if (curPs.resize_spec.num_sizes > 0) { - op = createResize(DT_U32, curPs.resize_spec); - } else { - CHECK_REQUIREMENT(curPs.resize_spec.num_scales == 4); - op = createResize(DT_F32, curPs.resize_spec); - } + op = createResize(dt, curPs.resize_spec); break; } case OT_Slice: { @@ -424,10 +385,7 @@ class Factory { break; } case OT_PreAllocatedMemory: { - PreAllocatedMemoryParamSpec curPreAllocatedMemoryParamSpec = - curOps.ps.preallocated_memory_spec; - TensorDesc desc = curPreAllocatedMemoryParamSpec.desc; - op = createPreAllocatedMemory(dtNoQ, desc); + op = createPreAllocatedMemory(curOps.ps.preallocated_memory_spec); break; } case OT_SharedWeight: { @@ -527,18 +485,6 @@ class Factory { op = createWhere(dt); 
break; } - case OT_SoftPlus: { - ActivationParamSpec activationDesc; - activationDesc.mode = ACTIVATION_SOFTPLUS; - op = createActivation(activationDesc); - break; - } - case OT_Exp: { - ActivationParamSpec activationDesc; - activationDesc.mode = ACTIVATION_EXP; - op = createActivation(activationDesc); - break; - } case OT_Tdnn: { op = createTdnn(dt, curPs.tdnn_spec); break; @@ -551,26 +497,10 @@ class Factory { op = createTopK(dt, curPs.topk_spec); break; } - case OT_Abs: { - ActivationParamSpec activationDesc; - activationDesc.mode = ACTIVATION_ABS; - op = createActivation(activationDesc); - break; - } case OT_Cast: { op = createCast(dt, curPs.cast_spec); break; } - case OT_Equal: { - op = createEqual(dt, curPs.equal_spec); - break; - } - case OT_Sign: { - ActivationParamSpec activationDesc; - activationDesc.mode = ACTIVATION_SIGN; - op = createActivation(activationDesc); - break; - } case OT_InstanceNorm: { op = createInstanceNorm(dt, curPs.in_spec); break; @@ -591,38 +521,52 @@ class Factory { op = createSelect(dt); break; } - case OT_Not: { - ActivationParamSpec activationDesc; - activationDesc.mode = ACTIVATION_NOT; - op = createActivation(activationDesc); + case OT_GAT: { + op = createGAT(dt, curPs.gat_spec); break; } - case OT_Log: { - ActivationParamSpec activationDesc; - activationDesc.mode = ACTIVATION_LOG; - op = createActivation(activationDesc); + case OT_RoIAlign: { + op = createRoIAlign(dt, curPs.roialign_spec); break; } - case OT_Neg: { - ActivationParamSpec activationDesc; - activationDesc.mode = ACTIVATION_NEG; - op = createActivation(activationDesc); + case OT_GenerateProposals: { + op = createGenerateProposals(dt, curPs.generate_proposals_spec); break; } - case OT_GAT: { - op = createGAT(dt, curPs.gat_spec); + case OT_QuantizeLinear: { + op = createQuantizeLinear(dt, curPs.quant_spec); break; } - case OT_RoIAlign: { - op = createRoIAlign(curPs.roialign_spec); + case OT_GridSample: { + op = createGridSample(dt, curPs.grid_sample_spec); break; } - case OT_GenerateProposals: { - op = createGenerateProposals(dt, curPs.generate_proposals_spec); + case OT_OneHot: { + op = createOneHot(dt, curPs.onehot_spec); + break; + } + case OT_CumSum: { + op = createCumSum(dt, curPs.cumsum_spec); + break; + } + case OT_NonMaxSuppression: { + op = createNonMaxSuppression(dt, curPs.non_max_suppression_spec); + break; + } + case OT_ConstantOfShape: { + op = createConstantOfShape(dt, curPs.constant_of_shape_spec); + break; + } + case OT_NonZero: { + op = createNonZero(dt); + break; + } + case OT_Range: { + op = createRange(dt, curPs.range_spec); break; } default: { - UNI_ERROR_LOG("unsupported layer %s\n", OperatorTypeName()[opType]); + UNI_ERROR_LOG("can not create layer %s.\n", OperatorTypeName()[opType]); break; } } diff --git a/inference/engine/include/generate_proposals.hpp b/inference/engine/include/generate_proposals.hpp index d55fab89..f63c9d24 100644 --- a/inference/engine/include/generate_proposals.hpp +++ b/inference/engine/include/generate_proposals.hpp @@ -34,9 +34,7 @@ class GenerateProposals : public WeightOperator { bool findId = false; this->anchorBlockDim = 4; U32 tensorNum = inTensors.size(); - if (tensorNum != 3) { - CHECK_STATUS(NOT_MATCH); - } + CHECK_REQUIREMENT(tensorNum == 3); for (U32 i = 0; i < tensorNum; i++) { U32 j = (i + 1) % tensorNum; TensorDesc iDesc = inTensors[i]->get_desc(); @@ -60,13 +58,10 @@ class GenerateProposals : public WeightOperator { } } } - if (!findId) { - CHECK_STATUS(NOT_MATCH); - } + CHECK_REQUIREMENT(findId); } protected: - DataType dt; 
GenerateProposalsParamSpec p; U8 deltaTensorId; U8 logitTensorId; diff --git a/inference/engine/include/grid_sample.hpp b/inference/engine/include/grid_sample.hpp new file mode 100644 index 00000000..ac5193bb --- /dev/null +++ b/inference/engine/include/grid_sample.hpp @@ -0,0 +1,36 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _GRID_SAMPLE_H +#define _GRID_SAMPLE_H + +#include "operator.hpp" + +class GridSample : public Operator { +public: + GridSample(DataType dt, GridSampleParamSpec p) + { + this->dt = dt; + this->p = p; + } + + OperatorType get_type() override + { + return OT_GridSample; + } + +protected: + GridSampleParamSpec p; +}; + +#endif // _GRID_SAMPLE_H diff --git a/inference/engine/include/image_container.hpp b/inference/engine/include/image_container.hpp index 9792b658..6288a93a 100644 --- a/inference/engine/include/image_container.hpp +++ b/inference/engine/include/image_container.hpp @@ -11,8 +11,8 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -#ifndef _IMAGE_CONTAINER_ -#define _IMAGE_CONTAINER_ +#ifndef _IMAGE_CONTAINER_H +#define _IMAGE_CONTAINER_H #include "tensor_desc.h" #include "image_manager.hpp" @@ -49,7 +49,7 @@ class ImageContainer : public ImageManager { { I32 vecId = ImageManager::getImageVecsId(slot, width, height, depth); if (vecId < 0 || vecId >= (I32)images[slot].size()) { - CHECK_STATUS(NOT_MATCH); + UNI_ERROR_LOG("gpu image buffer reuse wrong.\n"); } return *(images[slot][vecId].get()); } diff --git a/inference/engine/include/image_manager.hpp b/inference/engine/include/image_manager.hpp index 5880d5de..ece144c9 100644 --- a/inference/engine/include/image_manager.hpp +++ b/inference/engine/include/image_manager.hpp @@ -11,8 +11,8 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-#ifndef _IMAGE_MANAGER_ -#define _IMAGE_MANAGER_ +#ifndef _IMAGE_MANAGER_H +#define _IMAGE_MANAGER_H #include class ImageManager { @@ -32,7 +32,7 @@ class ImageManager { if (width == 0 && height == 0 && depth == 0) { return false; } else if (width == 0 || height == 0 || depth == 0) { - CHECK_STATUS(NOT_MATCH); + UNI_ERROR_LOG("gpu image tensor parameter is wrong.\n"); } if (imageVecs.count(slot) == 0) { std::vector> strs(1, str); diff --git a/inference/engine/include/inference.hpp b/inference/engine/include/inference.hpp index 5f9cfbe9..0db6b1cf 100644 --- a/inference/engine/include/inference.hpp +++ b/inference/engine/include/inference.hpp @@ -18,7 +18,6 @@ #ifdef _USE_GPU #include "gcl.h" #endif -#include "thread_affinity.h" inline std::map extractInputDims(const ModelSpec *ms) { @@ -41,10 +40,9 @@ inline std::shared_ptr createPipelinefromMs( // create ops cnn->initialize_ops(ms); - std::map inputDescMap = extractInputDims(ms); - cnn->loadAlgorithmMap(algorithmMapPath); + std::map inputDescMap = extractInputDims(ms); // assign space for output, tmp, bias, and trans_weight cnn->ready(inputDescMap); @@ -56,12 +54,13 @@ inline std::shared_ptr createPipelinefromMs( inline std::shared_ptr createPipeline( const char *affinityPolicyName, const char *modelPath, const char *algorithmMapPath = "") { - // deserialize model from file + std::shared_ptr pipeline; ModelSpec ms; - CHECK_STATUS(deserialize_model_from_file(modelPath, &ms)); - std::shared_ptr pipeline = createPipelinefromMs(affinityPolicyName, &ms, algorithmMapPath); - CHECK_STATUS(mt_destroy_model(&ms)); + EE ret = deserialize_model_from_file(modelPath, &ms); + if (ret == SUCCESS) { + pipeline = createPipelinefromMs(affinityPolicyName, &ms, algorithmMapPath); + CHECK_STATUS(mt_destroy_model(&ms)); + } return pipeline; } - #endif diff --git a/inference/engine/include/instance_norm.hpp b/inference/engine/include/instance_norm.hpp index fda06c9f..76b1ad11 100644 --- a/inference/engine/include/instance_norm.hpp +++ b/inference/engine/include/instance_norm.hpp @@ -22,7 +22,6 @@ class InstanceNorm : public WeightOperator { { this->dt = dt; this->p = p; - this->numChannels = 0; } OperatorType get_type() override @@ -32,7 +31,6 @@ class InstanceNorm : public WeightOperator { protected: InstanceNormParamSpec p; - U32 numChannels; }; #endif // _INSTANCE_NORM_H diff --git a/inference/engine/include/jump.hpp b/inference/engine/include/jump.hpp index 2932217b..102396d3 100644 --- a/inference/engine/include/jump.hpp +++ b/inference/engine/include/jump.hpp @@ -46,7 +46,7 @@ class Jump : public Operator { // check status if (this->inputTensors.size() > 1) { Tensor inputTensor = this->inputTensors[1]; - I32 *ptr = (I32 *)((CpuMemory *)(inputTensor.get_memory()))->get_ptr(); + U8 *ptr = (U8 *)((CpuMemory *)(inputTensor.get_memory()))->get_ptr(); U32 length = inputTensor.length(); for (U32 i = 0; i < length; i++) { if (ptr[i]) { diff --git a/inference/engine/include/layer_norm.hpp b/inference/engine/include/layer_norm.hpp index f9e27ac0..4b39599e 100644 --- a/inference/engine/include/layer_norm.hpp +++ b/inference/engine/include/layer_norm.hpp @@ -18,9 +18,10 @@ class LayerNorm : public WeightOperator { public: - LayerNorm(DataType dt, U32 weightNum) + LayerNorm(DataType dt, LayerNormParamSpec p, U32 weightNum) { this->dt = dt; + this->p = p; this->weightNum = weightNum; this->hasBias = false; } @@ -31,6 +32,7 @@ class LayerNorm : public WeightOperator { } protected: + LayerNormParamSpec p; U32 weightNum; }; diff --git 
a/inference/engine/include/memory_tracker.hpp b/inference/engine/include/memory_tracker.hpp index adc0a790..5110846e 100644 --- a/inference/engine/include/memory_tracker.hpp +++ b/inference/engine/include/memory_tracker.hpp @@ -96,7 +96,7 @@ class MemoryTracker { { I32 subSlot = imageManager.getImageVecsId(slot, str[0], str[1], str[2]); if (subSlot < 0) { - CHECK_STATUS(NOT_MATCH); + UNI_ERROR_LOG("gpu image buffer reuse parameter is wrong.\n"); } return subSlot; } diff --git a/inference/engine/include/model.hpp b/inference/engine/include/model.hpp index f9fa654c..e17aa577 100644 --- a/inference/engine/include/model.hpp +++ b/inference/engine/include/model.hpp @@ -15,9 +15,8 @@ #define _MODEL_H #include "operator.hpp" -#include "tensor_desc.h" #include "algorithm_map.h" -#include "thread_affinity.h" +#include "affinity_policy.h" #ifdef _USE_GPU #include "gcl.h" #endif @@ -27,153 +26,40 @@ class Model { Model() {} - Model(AffinityPolicy affinityPolicy, DataType dt, std::string name) - { - this->set_device_info(affinityPolicy); - this->dt = dt; - this->name = name; - std::string deviceName = ""; - if (IS_GPU(this->deviceInfo.schedule)) { -#ifdef _USE_GPU - if (OCLContext::getInstance().handle->useQualcommDev) { - this->deviceInfo.schedule = QUALCOMM; - } -#else - UNI_ERROR_LOG("This library not support ARM MALI/Qualcomm GPU, please rebuild library " - "with --gpu option.\n"); - exit(1); -#endif - } - algorithmMap = std::shared_ptr( - new AlgorithmMap(this->deviceInfo.schedule, name, deviceName, dt)); - } - - void set_runtime_device(int cpuId, int threadId = 0) - { - this->set_runtime_device(cpuId, this->deviceInfo.archs[cpuId], threadId); - } - - void set_runtime_device(int cpuId, Arch arch, int threadId = 0) - { - this->deviceInfo.schedule = arch; - UNI_DEBUG_LOG("Inference use %s.\n", ArchName()[this->deviceInfo.schedule]) - if (cpuId >= 0 && cpuId < this->deviceInfo.cpuNum) { - set_thread_affinity(threadId, &cpuId, 1); - for (auto op : ops) { - op->set_schedule(this->deviceInfo.schedule); - } - } - } - - void set_runtime_device_dynamic(int threadId = 0) - { - set_cpu_dynamic(&this->deviceInfo, threadId); - } - - Arch get_runtime_device() - { - return this->deviceInfo.schedule; - } - - virtual void ready(std::map inputDescMap) - { - infer_output_tensors_size(inputDescMap); - assign_output_tensor(); - - infer_tmp_memory_size(); - assign_tmp_tensor(); - } + explicit Model(AffinityPolicy affinityPolicy, DataType dt, std::string name); + + virtual ~Model() = default; + + virtual void ready(std::map inputDescMap); virtual void run() = 0; #ifdef _USE_INT8 - virtual U32 find_next_dynamic_scale_op(std::vector calibratedOpIdx, U32 startIdx) - { - CHECK_REQUIREMENT(startIdx < this->ops.size()) - for (U32 i = startIdx; i < this->ops.size();) { - auto op = this->ops[i]; - if (op->is_dynamic_scale()) { - bool calibrated = false; - for (auto idx : calibratedOpIdx) { - if (i == idx) { - calibrated = true; - break; - } - } - if (!calibrated) { - return i; - } - } - - if (op->get_type() == OT_Repeat || op->get_type() == OT_Jump) { - i = op->get_next_operator_index(); - } else { - i++; - } - } - - return 0; // The first layer should never be quantized - } - - virtual std::shared_ptr get_operator_by_index(U32 index) - { - return this->ops[index]; - } - - virtual void run_till_breakpoint(U32 opIdx) - { - CHECK_REQUIREMENT(IS_CPU(this->deviceInfo.schedule)); - for (U32 i = 0; i < this->ops.size();) { - auto op = this->ops[i]; - if (op->get_type() == OT_Repeat || op->get_type() == OT_Jump) { - if (opIdx == i) { 
- break; - } - i = op->get_next_operator_index(); - } else { - op->run(); - if (opIdx == i) { - break; - } - i++; - } - } - } -#endif + virtual U32 find_next_dynamic_scale_op(std::vector calibratedOpIdx, U32 startIdx); - std::string get_name() - { - return this->name; - } + virtual std::shared_ptr get_operator_by_index(U32 index); - void loadAlgorithmMap(CI8 *path, bool useFileStream = false) - { - std::string algoName = this->algorithmMap->getAlgorithmFileName(); - CI8 *algoInfo = nullptr; - if (IS_GPU(this->deviceInfo.schedule)) { -#ifdef _USE_GPU - algoInfo = gcl_get_algorithm_info(OCLContext::getInstance().handle.get(), algoName); + virtual void run_till_breakpoint(U32 opIdx); #endif - } - if (!algoInfo && useFileStream) { - algoInfo = path; - } - if (algoInfo) { - this->algorithmMap->loadAlgorithmMapFromFileStream(algoInfo); - } else if (path) { - this->algorithmMap->loadAlgorithmMapFromFile(path); - } - } - - void saveAlgorithmMapToFile(std::string algorithmMapPath) - { - this->algorithmMap->saveAlgorithmMapToFile(algorithmMapPath); - } + + void loadAlgorithmMap(CI8 *path, bool useFileStream = false); + + void saveAlgorithmMapToFile(std::string algorithmMapPath); + + void set_runtime_device(int cpuId, int threadId = 0); + + void set_runtime_device(int cpuId, Arch arch, int threadId = 0); + + void set_runtime_device_dynamic(int threadId = 0); + + Arch get_runtime_device(); + + std::string get_name(); protected: + DataType dt; std::vector> ops; DeviceInfo deviceInfo; - DataType dt; std::shared_ptr algorithmMap; virtual EE infer_output_tensors_size(std::map) = 0; @@ -181,29 +67,9 @@ class Model { virtual void infer_tmp_memory_size() = 0; virtual void assign_tmp_tensor() = 0; - virtual bool checkOperator() - { - for (auto op : this->ops) { - if (!op->checkOperator()) { - return false; - } - } - return true; - } - private: std::string name; - void set_device_info(AffinityPolicy affinityPolicy) - { -#ifndef _USE_IOS - this->deviceInfo = get_cpu_info(affinityPolicy); - this->set_runtime_device_dynamic(); -#else - this->deviceInfo.affinityPolicy = affinityPolicy; - this->deviceInfo.schedule = ARM_A76; -#endif - UNI_DEBUG_LOG("Inference use %s.\n", ArchName()[this->deviceInfo.schedule]) - } + void set_device_info(AffinityPolicy affinityPolicy); }; #endif diff --git a/inference/engine/include/non_max_suppression.hpp b/inference/engine/include/non_max_suppression.hpp new file mode 100644 index 00000000..c3e2a65f --- /dev/null +++ b/inference/engine/include/non_max_suppression.hpp @@ -0,0 +1,35 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _NON_MAX_SUPPRESSION_H +#define _NON_MAX_SUPPRESSION_H + +#include "operator.hpp" + +class NonMaxSuppression : public Operator { +public: + explicit NonMaxSuppression(DataType dt, NonMaxSuppressionParamSpec p) + { + this->dt = dt; + this->p = p; + } + + OperatorType get_type() override + { + return OT_NonMaxSuppression; + } + +protected: + NonMaxSuppressionParamSpec p; +}; +#endif // _NON_MAX_SUPPRESSION_H diff --git a/inference/engine/include/non_zero.hpp b/inference/engine/include/non_zero.hpp new file mode 100644 index 00000000..fb3b0865 --- /dev/null +++ b/inference/engine/include/non_zero.hpp @@ -0,0 +1,31 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#ifndef _NON_ZERO_H +#define _NON_ZERO_H + +#include "operator.hpp" + +class NonZero : public Operator { +public: + explicit NonZero(DataType dt) + { + this->dt = dt; + } + + OperatorType get_type() override + { + return OT_NonZero; + } +}; +#endif // _NON_ZERO_H diff --git a/inference/engine/include/ocl/activation_ocl.hpp b/inference/engine/include/ocl/activation_ocl.hpp index aa492719..ac6767a1 100644 --- a/inference/engine/include/ocl/activation_ocl.hpp +++ b/inference/engine/include/ocl/activation_ocl.hpp @@ -18,7 +18,7 @@ class ActivationOCL : public Activation { public: - ActivationOCL(ActivationParamSpec activationDesc) : Activation(activationDesc) + ActivationOCL(ActivationParamSpec p) : Activation(p) { INIT_GPU_INFO(nullptr) } @@ -28,7 +28,7 @@ class ActivationOCL : public Activation { std::shared_ptr clone() override { std::shared_ptr mem = - std::shared_ptr(new ActivationOCL(this->activationDesc)); + std::shared_ptr(new ActivationOCL(this->p)); *mem = *this; return mem; } @@ -38,7 +38,7 @@ class ActivationOCL : public Activation { OCLContext::getInstance().handle.get()->curOpName = this->get_name(); Tensor inputTensor = this->inputTensors[0]; Tensor outputTensor = this->outputTensors[0]; - CHECK_STATUS(activation(inputTensor, this->activationDesc, outputTensor, &this->archInfo)); + CHECK_STATUS(activation(inputTensor, this->p, outputTensor, &this->archInfo)); } EE infer_output_tensors_size( diff --git a/inference/engine/include/ocl/bilateral_slice_apply_ocl.hpp b/inference/engine/include/ocl/bilateral_slice_apply_ocl.hpp index 4218cc9b..a1d7712c 100644 --- a/inference/engine/include/ocl/bilateral_slice_apply_ocl.hpp +++ b/inference/engine/include/ocl/bilateral_slice_apply_ocl.hpp @@ -41,7 +41,7 @@ class BilateralSliceApplyOCL : public BilateralSliceApply { Tensor gridTensor = this->inputTensors[1]; Tensor outputTensor = this->outputTensors[0]; - if (this->p.mode == BSliceApply_NULL) { + if (this->p.mode == BSLICE_APPLY_NULL) { this->guideTensor = this->inputTensors[2]; } CHECK_STATUS(bilateral_slice_apply( diff --git a/inference/engine/include/ocl/cast_ocl.hpp b/inference/engine/include/ocl/cast_ocl.hpp index 2baa1d54..4d5d5aea 100644 --- a/inference/engine/include/ocl/cast_ocl.hpp +++ b/inference/engine/include/ocl/cast_ocl.hpp @@ -35,16 +35,14 @@ class CastOCL : public Cast { inline void run_prepare() { OCLContext::getInstance().handle.get()->curOpName = this->get_name(); - Tensor inputTensor = this->inputTensors[0]; - Tensor outputTensor = this->outputTensors[0]; - CHECK_STATUS(cast(inputTensor, outputTensor, this->p, &this->archInfo)); + CHECK_STATUS(cast(this->inputTensors[0], this->p, this->outputTensors[0], &this->archInfo)); } EE infer_output_tensors_size( std::vector inTensors, std::vector outTensors) override { this->needSetKernelVec = true; - CHECK_STATUS(cast_infer_output_size(inTensors[0], outTensors[0], this->p, &this->archInfo)); + CHECK_STATUS(cast_infer_output_size(inTensors[0], this->p, outTensors[0], &this->archInfo)); return SUCCESS; } REGISTER_OCL_OPERATOR_RUN diff --git a/inference/engine/include/ocl/concat_ocl.hpp b/inference/engine/include/ocl/concat_ocl.hpp index d52615c3..a864e0be 100644 --- a/inference/engine/include/ocl/concat_ocl.hpp +++ b/inference/engine/include/ocl/concat_ocl.hpp @@ -73,7 +73,8 @@ class ConcatOCL : public Concat { U32 infer_tmp_memory_size() override { U32 bytes = 0; - CHECK_STATUS(concat_infer_forward_tmp_bytes(this->inputTensors, &bytes, &this->archInfo)); + CHECK_STATUS(concat_infer_forward_tmp_bytes( + this->inputTensors, 
this->outputTensors[0], &bytes, &this->archInfo)); return bytes; } REGISTER_OCL_OPERATOR_RUN diff --git a/inference/engine/include/ocl/convolution_ocl.hpp b/inference/engine/include/ocl/convolution_ocl.hpp index 9765c7b7..7e702ba9 100644 --- a/inference/engine/include/ocl/convolution_ocl.hpp +++ b/inference/engine/include/ocl/convolution_ocl.hpp @@ -47,7 +47,7 @@ class ConvolutionOCL : public Convolution { U32 filterNum = 1; DataType dtNoQ = (this->dt == DT_F16_8Q) ? DT_F16 : this->dt; switch (this->p.convolution_type) { - case Convolution_Pointwise: { + case CONVOLUTION_POINTWISE: { if (this->p.num_outputs_origin == 1) { if (tensorIs5d(wDesc[0])) { wDesc[0].dims[4] = this->p.num_outputs; @@ -61,13 +61,13 @@ class ConvolutionOCL : public Convolution { CONVOLUTION_ALGORITHM_NULL; break; } - case Convolution_Depthwise: { + case CONVOLUTION_DEPTHWISE: { vDesc[0] = tensor1d(dtNoQ, this->p.num_outputs); ((MaliPara_t)(this->archInfo.archPara))->forwardRunInfo->algorithm = DEPTHWISE_CONVOLUTION_ALGORITHM_NULL; break; } - case Convolution_Depthwise_Pointwise: { + case CONVOLUTION_DEPTHWISE_POINTWISE: { wDesc[1] = this->filterDescExt; vDesc[0] = tensor1d(dtNoQ, this->numChannels); vDesc[1] = tensor1d(dtNoQ, this->p.num_outputs); @@ -76,13 +76,8 @@ class ConvolutionOCL : public Convolution { DEPTHWISE_CONVOLUTION_ALGORITHM_NULL; break; } - case Convolution_Dilation: { - CHECK_STATUS(NOT_SUPPORTED); - return NOT_SUPPORTED; - break; - } default: - CHECK_STATUS(NOT_SUPPORTED); + UNI_ERROR_LOG("not support to read new type convolution's weight.\n"); return NOT_SUPPORTED; } @@ -106,7 +101,7 @@ class ConvolutionOCL : public Convolution { Tensor biasTensor = this->biasTensors[0]; Tensor outputTensor = this->outputTensors[0]; switch (this->p.convolution_type) { - case Convolution_Pointwise: { + case CONVOLUTION_POINTWISE: { Tensor tmpTensor = Tensor(OCLMem); std::vector tmpTensors(3, tmpTensor); tmpTensors[0] = this->temp; @@ -121,15 +116,15 @@ class ConvolutionOCL : public Convolution { &this->archInfo)); break; } - case Convolution_Depthwise: { + case CONVOLUTION_DEPTHWISE: { Tensor tmpTensor = this->temp; get_tmp_image(0, bytes + 1, &tmpTensor); - CHECK_STATUS( - depthwise_convolution(inputTensor, filterTensor, p, this->dwAlg, biasTensor, - tmpTensor, outputTensor, this->dwActivationParamSpec, &this->archInfo)); + CHECK_STATUS(depthwise_convolution(inputTensor, filterTensor, p, this->dwAlg, + nullptr, biasTensor, tmpTensor, outputTensor, this->dwActivationParamSpec, + &this->archInfo)); break; } - case Convolution_Depthwise_Pointwise: { + case CONVOLUTION_DEPTHWISE_POINTWISE: { auto dwFilterTensor = filterTensor; auto pwFilterTensor = this->weightTensors[1]; auto dwBiasTensor = biasTensor; @@ -140,17 +135,13 @@ class ConvolutionOCL : public Convolution { get_tmp_image(0, bytes + 1, &tmpTensors[1]); get_tmp_image(1, bytes + 4, &tmpTensors[2]); CHECK_STATUS(depthwise_pointwise_convolution(this->inputTensors, dwFilterTensor, - pwFilterTensor, p, this->dwAlg, dwBiasTensor, pwBiasTensor, tmpTensors, + pwFilterTensor, p, this->dwAlg, nullptr, dwBiasTensor, pwBiasTensor, tmpTensors, outputTensor, this->dwActivationParamSpec, this->pwActivationParamSpec, &this->archInfo)); break; } - case Convolution_Dilation: { - CHECK_STATUS(NOT_SUPPORTED); - break; - } default: { - UNI_ERROR_LOG("unsupported convolution type %d\n", this->p.convolution_type); + UNI_ERROR_LOG("not support to run new type convolution.\n"); } } } @@ -165,9 +156,10 @@ class ConvolutionOCL : public Convolution { ConvolutionPolicy policy = 
CONVOLUTION_TUNNING; DataType targetType = DT_F16; I32 algo[7]; - std::string name = this->name + std::to_string(get_type()) + std::to_string(this->p.convolution_type); + std::string name = + this->name + std::to_string(get_type()) + std::to_string(this->p.convolution_type); switch (this->p.convolution_type) { - case Convolution_Pointwise: { + case CONVOLUTION_POINTWISE: { if (this->dt == DT_F16_8Q) { targetType = DT_I8; } @@ -190,7 +182,7 @@ class ConvolutionOCL : public Convolution { } break; } - case Convolution_Depthwise: { + case CONVOLUTION_DEPTHWISE: { if (algorithmMap->getAlgorithmInfoFromMap(name, algo, 4)) { this->runInfo.algorithm = (ConvolutionForwardAlgorithm)algo[0]; this->runInfo.best_h[0] = algo[1]; @@ -210,7 +202,7 @@ class ConvolutionOCL : public Convolution { } break; } - case Convolution_Depthwise_Pointwise: { + case CONVOLUTION_DEPTHWISE_POINTWISE: { if (algorithmMap->getAlgorithmInfoFromMap(name, algo, 7)) { this->runInfo.algorithm = (ConvolutionForwardAlgorithm)algo[0]; this->runInfo.best_h[0] = algo[1]; @@ -239,12 +231,9 @@ class ConvolutionOCL : public Convolution { } break; } - case Convolution_Dilation: { - CHECK_STATUS(NOT_SUPPORTED); - break; - } default: - CHECK_STATUS(NOT_SUPPORTED); + UNI_ERROR_LOG("not support to infer new type convolution's algorithm.\n"); + return NOT_SUPPORTED; } return SUCCESS; } @@ -293,11 +282,11 @@ class ConvolutionOCL : public Convolution { } DataType targetType = DT_F16; // Default DT_F16 - if (this->p.convolution_type == Convolution_Dilation) { - this->p.convolution_type = Convolution_Pointwise; + if (this->p.convolution_type == CONVOLUTION_DILATION) { + this->p.convolution_type = CONVOLUTION_POINTWISE; } switch (this->p.convolution_type) { - case Convolution_Pointwise: { + case CONVOLUTION_POINTWISE: { if (tensorIs5d(inDim)) { this->filterDesc = tensor5df(this->dt, DF_NCHW, numFiltersOcl, this->numChannels, this->p.kernel_t, this->p.kernel_h, this->p.kernel_w); @@ -310,7 +299,7 @@ class ConvolutionOCL : public Convolution { inputTensor, filterTensor, p, outTensors[0], targetType, &this->archInfo)); break; } - case Convolution_Depthwise: { + case CONVOLUTION_DEPTHWISE: { this->filterDesc = tensor4df( this->dt, DF_NCHW, 1, this->numChannels, this->p.kernel_h, this->p.kernel_w); filterTensor.resize(this->filterDesc); @@ -318,7 +307,7 @@ class ConvolutionOCL : public Convolution { inputTensor, filterTensor, p, outTensors[0], targetType, &this->archInfo)); break; } - case Convolution_Depthwise_Pointwise: { + case CONVOLUTION_DEPTHWISE_POINTWISE: { this->filterDesc = tensor4df( this->dt, DF_NCHW, 1, this->numChannels, this->p.kernel_h, this->p.kernel_w); this->filterDescExt = @@ -330,12 +319,9 @@ class ConvolutionOCL : public Convolution { filterTensor, filterTensorExt, p, outTensors[0], targetType, &this->archInfo)); break; } - case Convolution_Dilation: { - return NOT_SUPPORTED; - break; - } default: - CHECK_STATUS(NOT_SUPPORTED); + UNI_ERROR_LOG("not support to infer new type convolution's output.\n"); + return NOT_SUPPORTED; } if (use_output_tensor_image(numFiltersOcl, inputTensor)) { CHECK_STATUS(set_tensors_image(outTensors, inTensors.size())); @@ -352,28 +338,24 @@ class ConvolutionOCL : public Convolution { bytes[i] = 0; } switch (this->p.convolution_type) { - case Convolution_Pointwise: { + case CONVOLUTION_POINTWISE: { CHECK_STATUS(convolution_infer_forward_tmp_bytes(inputTensor, filterTensor, outputTensor, p, this->pwAlg, bytes, &this->archInfo)); break; } - case Convolution_Depthwise: { + case CONVOLUTION_DEPTHWISE: { 
CHECK_STATUS(depthwise_convolution_infer_forward_tmp_bytes(inputTensor, filterTensor, outputTensor, p, this->dwAlg, bytes, &this->archInfo)); break; } - case Convolution_Depthwise_Pointwise: { + case CONVOLUTION_DEPTHWISE_POINTWISE: { CHECK_STATUS(depthwise_pointwise_convolution_infer_forward_tmp_bytes(inputTensor, filterTensor, this->weightTensors[1], outputTensor, p, this->dwAlg, bytes, &this->archInfo)); break; } - case Convolution_Dilation: { - CHECK_STATUS(NOT_SUPPORTED); - break; - } default: - CHECK_STATUS(NOT_SUPPORTED); + UNI_ERROR_LOG("not support to infer new type convolution's tmp memory.\n"); } add_tmp_image(0, bytes + 1); add_tmp_image(1, bytes + 4); @@ -387,22 +369,21 @@ class ConvolutionOCL : public Convolution { U32 biasNum = 0; TensorDesc desc[2]; switch (this->p.convolution_type) { - case Convolution_Pointwise: { + case CONVOLUTION_POINTWISE: { CHECK_STATUS(convolution_transform_filter_bytes( filterTensor, this->p, this->pwAlg, desc, &this->archInfo)); - if (this->runInfo.best_k[0] <= 1 && - this->pwAlg == CONVOLUTION_ALGORITHM_DIRECT) { + if (this->runInfo.best_k[0] <= 1 && this->pwAlg == CONVOLUTION_ALGORITHM_DIRECT) { needTransBiasImgToBuf = true; biasNum = 0; } break; } - case Convolution_Depthwise: { + case CONVOLUTION_DEPTHWISE: { CHECK_STATUS(depthwise_convolution_transform_filter_bytes( filterTensor, this->p, this->dwAlg, desc, &this->archInfo)); break; } - case Convolution_Depthwise_Pointwise: { + case CONVOLUTION_DEPTHWISE_POINTWISE: { CHECK_STATUS(depthwise_pointwise_convolution_transform_filter_bytes(filterTensor, this->weightTensors[1], this->p, this->dwAlg, &desc[0], &desc[1], &this->archInfo)); @@ -415,12 +396,10 @@ class ConvolutionOCL : public Convolution { } break; } - case Convolution_Dilation: { - CHECK_STATUS(NOT_SUPPORTED); - break; - } default: - CHECK_STATUS(NOT_SUPPORTED); + UNI_ERROR_LOG("not support to infer new type convolution's tramsform filter tmp " + "memory.\n"); + return NOT_SUPPORTED; } this->wtm = std::shared_ptr(new Tensor(OCLMem)); this->wtm->resize(desc[0]); @@ -448,38 +427,36 @@ class ConvolutionOCL : public Convolution { EE transform_filter() override { auto filterTensor = this->weightTensors[0]; - if (DT_F16_8Q == this->dt && Convolution_Pointwise == this->p.convolution_type && + if (DT_F16_8Q == this->dt && CONVOLUTION_POINTWISE == this->p.convolution_type && CONVOLUTION_ALGORITHM_WINOGRAD == this->pwAlg) { // int8 winograd return NOT_SUPPORTED; } else if (DT_F16_8Q == this->dt && - Convolution_Pointwise == this->p.convolution_type) { // int8 tilegemm + CONVOLUTION_POINTWISE == this->p.convolution_type) { // int8 tilegemm return NOT_SUPPORTED; } else { // All other cases CHECK_STATUS(alloc_wtm_memory()); switch (this->p.convolution_type) { - case Convolution_Pointwise: { + case CONVOLUTION_POINTWISE: { CHECK_STATUS(convolution_transform_filter(filterTensor, this->p, this->pwAlg, this->temp, this->wtm.get(), &this->archInfo)); break; } - case Convolution_Depthwise: { + case CONVOLUTION_DEPTHWISE: { CHECK_STATUS(depthwise_convolution_transform_filter( filterTensor, this->p, this->dwAlg, this->wtm.get(), &this->archInfo)); break; } - case Convolution_Depthwise_Pointwise: { + case CONVOLUTION_DEPTHWISE_POINTWISE: { CHECK_STATUS(depthwise_pointwise_convolution_transform_filter(filterTensor, this->weightTensors[1], this->p, this->dwAlg, this->wtm.get(), &this->wtm_dp, &this->archInfo)); this->weightTensors[1] = wtm_dp; break; } - case Convolution_Dilation: { - CHECK_STATUS(NOT_SUPPORTED); - break; + default: { + UNI_ERROR_LOG("not 
support to transform new type convolution's filter.\n"); + return NOT_SUPPORTED; } - default: - CHECK_STATUS(NOT_SUPPORTED); } } this->weightTensors[0] = *this->get_wtm(); diff --git a/inference/engine/include/ocl/copy_ocl.hpp b/inference/engine/include/ocl/copy_ocl.hpp index 7ac82768..c2c2dcea 100644 --- a/inference/engine/include/ocl/copy_ocl.hpp +++ b/inference/engine/include/ocl/copy_ocl.hpp @@ -38,9 +38,7 @@ class CopyOCL : public Copy { TensorDesc srcDesc = this->inputTensors[0].get_desc(); TensorDesc dstDesc = this->inputTensors[1].get_desc(); U32 batch = srcDesc.dims[srcDesc.nDims - 1]; - if (batch > 1) { - CHECK_STATUS(NOT_SUPPORTED); - } + CHECK_REQUIREMENT(batch == 1); U32 copyLength = (this->p.length >= 0) ? this->p.length : tensorNumElements(srcDesc) / batch; U32 srcStride = (this->p.src_dims[0] >= 0) ? this->p.src_dims[1] : tensorNumElements(srcDesc) / batch; diff --git a/inference/engine/include/ocl/eltwise_ocl.hpp b/inference/engine/include/ocl/eltwise_ocl.hpp index ca9be6ff..3233866c 100644 --- a/inference/engine/include/ocl/eltwise_ocl.hpp +++ b/inference/engine/include/ocl/eltwise_ocl.hpp @@ -18,7 +18,7 @@ class EltwiseOCL : public Eltwise { public: - EltwiseOCL(EltwiseParamSpec eltwiseDesc) : Eltwise(eltwiseDesc) + EltwiseOCL(EltwiseParamSpec p) : Eltwise(p) { INIT_GPU_INFO(nullptr) } @@ -27,8 +27,7 @@ class EltwiseOCL : public Eltwise { std::shared_ptr clone() override { - std::shared_ptr mem = - std::shared_ptr(new EltwiseOCL(this->eltwiseDesc)); + std::shared_ptr mem = std::shared_ptr(new EltwiseOCL(this->p)); *mem = *this; return mem; } @@ -36,8 +35,8 @@ class EltwiseOCL : public Eltwise { inline void run_prepare() { OCLContext::getInstance().handle.get()->curOpName = this->get_name(); - CHECK_STATUS(eltwise(this->inputTensors, this->eltwiseDesc, this->temp, - this->outputTensors[0], &this->archInfo)); + CHECK_STATUS(eltwise( + this->inputTensors, this->p, this->temp, this->outputTensors[0], &this->archInfo)); } EE infer_output_tensors_size( diff --git a/inference/engine/include/ocl/embedding_ocl.hpp b/inference/engine/include/ocl/embedding_ocl.hpp index a664c637..7e31960c 100644 --- a/inference/engine/include/ocl/embedding_ocl.hpp +++ b/inference/engine/include/ocl/embedding_ocl.hpp @@ -52,20 +52,16 @@ class EmbeddingOCL : public Embedding { std::vector inTensors, std::vector outTensors) override { this->needSetKernelVec = true; - if (this->p.num_output <= 0) { - if (inTensors.size() <= 1) { - CHECK_STATUS(NOT_SUPPORTED); - } + if (this->p.num_outputs <= 0) { + CHECK_REQUIREMENT(inTensors.size() > 1); TensorDesc desc = inTensors[1]->get_desc(); - if (desc.nDims != 2) { - CHECK_STATUS(NOT_MATCH); - } + CHECK_REQUIREMENT(desc.nDims == 2); if (this->p.transpose) { - this->p.input_dim = desc.dims[0]; - this->p.num_output = desc.dims[1]; + this->p.num_inputs = desc.dims[0]; + this->p.num_outputs = desc.dims[1]; } else { - this->p.input_dim = desc.dims[1]; - this->p.num_output = desc.dims[0]; + this->p.num_inputs = desc.dims[1]; + this->p.num_outputs = desc.dims[0]; } } CHECK_STATUS(embedding_infer_output_size( @@ -81,9 +77,9 @@ class EmbeddingOCL : public Embedding { } TensorDesc weightDesc; if (this->p.transpose) { - weightDesc = tensor2df(this->dt, DF_TRANSPOSE, this->p.num_output, this->p.input_dim); + weightDesc = tensor2df(this->dt, DF_TRANSPOSE, this->p.num_outputs, this->p.num_inputs); } else { - weightDesc = tensor2df(this->dt, DF_NORMAL, this->p.input_dim, this->p.num_output); + weightDesc = tensor2df(this->dt, DF_NORMAL, this->p.num_inputs, 
this->p.num_outputs); } Tensor modelWeightTensor = Tensor(OCLMem); modelWeightTensor.resize(weightDesc); diff --git a/inference/engine/include/ocl/factory_ocl.hpp b/inference/engine/include/ocl/factory_ocl.hpp index 7f401dc4..ea4ccebf 100644 --- a/inference/engine/include/ocl/factory_ocl.hpp +++ b/inference/engine/include/ocl/factory_ocl.hpp @@ -156,9 +156,10 @@ class FactoryOCL : public Factory { return std::shared_ptr(cep); } - std::shared_ptr createLayerNorm(DataType dt, U32 weightNum) override + std::shared_ptr createLayerNorm( + DataType dt, LayerNormParamSpec p, U32 weightNum) override { - auto cep = (LayerNorm *)new LayerNormOCL(dt, weightNum); + auto cep = (LayerNorm *)new LayerNormOCL(dt, p, weightNum); return std::shared_ptr(cep); } @@ -168,11 +169,9 @@ class FactoryOCL : public Factory { return std::shared_ptr(cep); } - std::shared_ptr createResize(DataType paramDT, ResizeParamSpec p) override + std::shared_ptr createResize(DataType dt, ResizeParamSpec p) override { - // auto cep = new Resize(paramDT, paramPtr); - // OP_UNSUP(2, paramDT, paramPtr); - auto cep = (Resize *)(new ResizeOCL(paramDT, p)); + auto cep = (Resize *)(new ResizeOCL(dt, p)); return std::shared_ptr(cep); } @@ -250,9 +249,9 @@ class FactoryOCL : public Factory { return std::shared_ptr(cep); } - std::shared_ptr createPreAllocatedMemory(DataType dt, TensorDesc desc) override + std::shared_ptr createPreAllocatedMemory(PreAllocatedMemoryParamSpec p) override { - auto cep = (PreAllocatedMemory *)new PreAllocatedMemoryOCL(dt, desc); + auto cep = (PreAllocatedMemory *)new PreAllocatedMemoryOCL(p); return std::shared_ptr(cep); } @@ -393,12 +392,6 @@ class FactoryOCL : public Factory { return std::shared_ptr(cep); } - std::shared_ptr createEqual(DataType dt, EqualParamSpec p) override - { - OP_UNSUP(2, dt, p); - return std::shared_ptr(cep); - } - std::shared_ptr createInstanceNorm(DataType dt, InstanceNormParamSpec p) override { OP_UNSUP(2, dt, p); @@ -435,9 +428,9 @@ class FactoryOCL : public Factory { return std::shared_ptr(cep); } - std::shared_ptr createRoIAlign(RoIAlignParamSpec p) override + std::shared_ptr createRoIAlign(DataType dt, RoIAlignParamSpec p) override { - auto cep = (RoIAlign *)new RoIAlignOCL(p); + auto cep = (RoIAlign *)new RoIAlignOCL(dt, p); return std::shared_ptr(cep); } @@ -453,5 +446,53 @@ class FactoryOCL : public Factory { OP_UNSUP(2, dt, p); return std::shared_ptr(cep); } + std::shared_ptr createQuantizeLinear(DataType dt, QuantizeLinearParamSpec p) override + { + OP_UNSUP(2, dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createGridSample(DataType dt, GridSampleParamSpec p) override + { + OP_UNSUP(2, dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createOneHot(DataType dt, OneHotParamSpec p) override + { + OP_UNSUP(2, dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createCumSum(DataType dt, CumSumParamSpec p) override + { + OP_UNSUP(2, dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createNonMaxSuppression( + DataType dt, NonMaxSuppressionParamSpec p) override + { + OP_UNSUP(2, dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createConstantOfShape(DataType dt, ConstantOfShapeParamSpec p) override + { + OP_UNSUP(2, dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createNonZero(DataType dt) override + { + OP_UNSUP(1, dt); + return std::shared_ptr(cep); + } + + std::shared_ptr createRange(DataType dt, RangeParamSpec p) override + { + OP_UNSUP(2, dt, p); + return std::shared_ptr(cep); + } }; #endif // 
_FACTORY_OCL_H diff --git a/inference/engine/include/ocl/fully_connected_ocl.hpp b/inference/engine/include/ocl/fully_connected_ocl.hpp index 1f832e60..b621ead0 100644 --- a/inference/engine/include/ocl/fully_connected_ocl.hpp +++ b/inference/engine/include/ocl/fully_connected_ocl.hpp @@ -131,9 +131,7 @@ class FullyConnectedOCL : public FullyConnected { auto biasMem = (OclMemory *)inTensors[1]->get_memory(); biasMem->padding(0, 8, 0, 0); } - if (this->p.num_slices > 1) { - CHECK_STATUS(NOT_SUPPORTED); - } + CHECK_REQUIREMENT(this->p.num_slices == 1); return SUCCESS; } @@ -174,11 +172,8 @@ class FullyConnectedOCL : public FullyConnected { { Tensor inputTensor = this->inputTensors[0]; Tensor filterTensor = this->weightTensors[0]; - if (this->p.num_slices == 1) { - CHECK_STATUS(alloc_wtm_memory()); - } else { - CHECK_STATUS(NOT_SUPPORTED); - } + CHECK_REQUIREMENT(this->p.num_slices == 1); + CHECK_STATUS(alloc_wtm_memory()); CHECK_STATUS(fully_connected_transform_filter( inputTensor, filterTensor, this->wtm.get(), &this->archInfo)); this->weightTensors[0] = *this->get_wtm(); diff --git a/inference/engine/include/ocl/layer_norm_ocl.hpp b/inference/engine/include/ocl/layer_norm_ocl.hpp index 4b3e8cd8..aefb73ba 100644 --- a/inference/engine/include/ocl/layer_norm_ocl.hpp +++ b/inference/engine/include/ocl/layer_norm_ocl.hpp @@ -18,7 +18,7 @@ class LayerNormOCL : public LayerNorm { public: - LayerNormOCL(DataType dt, U32 weightNum) : LayerNorm(dt, weightNum) + LayerNormOCL(DataType dt, LayerNormParamSpec p, U32 weightNum) : LayerNorm(dt, p, weightNum) { INIT_GPU_INFO(nullptr) } @@ -28,7 +28,7 @@ class LayerNormOCL : public LayerNorm { std::shared_ptr clone() override { std::shared_ptr mem = - std::shared_ptr(new LayerNormOCL(this->dt, this->weightNum)); + std::shared_ptr(new LayerNormOCL(this->dt, this->p, this->weightNum)); *mem = *this; return mem; } @@ -66,8 +66,8 @@ class LayerNormOCL : public LayerNorm { Tensor weightTensor = this->weightTensors[0]; Tensor biasTensor = this->biasTensors[0]; Tensor outputTensor = this->outputTensors[0]; - CHECK_STATUS(layer_normalization( - inputTensor, weightTensor, biasTensor, this->temp, outputTensor, &this->archInfo)); + CHECK_STATUS(layer_normalization(inputTensor, this->p, weightTensor, biasTensor, this->temp, + outputTensor, &this->archInfo)); } EE infer_output_tensors_size( diff --git a/inference/engine/include/ocl/matmul_ocl.hpp b/inference/engine/include/ocl/matmul_ocl.hpp index f1407b73..bd076d1b 100644 --- a/inference/engine/include/ocl/matmul_ocl.hpp +++ b/inference/engine/include/ocl/matmul_ocl.hpp @@ -84,9 +84,7 @@ class MatMulOCL : public MatMul { std::vector inTensors, std::vector outTensors) override { this->needSetKernelVec = true; - if (inTensors.size() > 2) { - CHECK_STATUS(NOT_SUPPORTED); - } + CHECK_REQUIREMENT(inTensors.size() == 2); CHECK_STATUS(matmul_infer_output_size(inTensors[0], this->p.transpose_a, inTensors[1], this->p.transpose_b, outTensors[0], &this->archInfo)); if (check_tensors_image(inTensors)) { diff --git a/inference/engine/include/ocl/power_ocl.hpp b/inference/engine/include/ocl/power_ocl.hpp index 61851551..d643d17d 100644 --- a/inference/engine/include/ocl/power_ocl.hpp +++ b/inference/engine/include/ocl/power_ocl.hpp @@ -45,7 +45,7 @@ class PowerOCL : public Power { std::vector inTensors, std::vector outTensors) override { this->needSetKernelVec = true; - CHECK_STATUS(power_infer_output_size(inTensors[0], outTensors[0], &this->archInfo)); + CHECK_STATUS(power_infer_output_size(inTensors[0], this->p, outTensors[0], 
&this->archInfo)); if (check_tensors_image(inTensors) && inTensors[0] != outTensors[0]) { CHECK_STATUS(set_tensors_image(outTensors, inTensors.size())); } diff --git a/inference/engine/include/ocl/preallocated_memory_ocl.hpp b/inference/engine/include/ocl/preallocated_memory_ocl.hpp index 56b28b78..e840d310 100644 --- a/inference/engine/include/ocl/preallocated_memory_ocl.hpp +++ b/inference/engine/include/ocl/preallocated_memory_ocl.hpp @@ -18,7 +18,7 @@ class PreAllocatedMemoryOCL : public PreAllocatedMemory { public: - PreAllocatedMemoryOCL(DataType dt, TensorDesc desc) : PreAllocatedMemory(dt, desc) + PreAllocatedMemoryOCL(PreAllocatedMemoryParamSpec p) : PreAllocatedMemory(p) { INIT_GPU_INFO(nullptr) } @@ -28,7 +28,7 @@ class PreAllocatedMemoryOCL : public PreAllocatedMemory { std::shared_ptr clone() override { std::shared_ptr mem = - std::shared_ptr(new PreAllocatedMemoryOCL(this->dt, this->desc)); + std::shared_ptr(new PreAllocatedMemoryOCL(this->p)); *mem = *this; return mem; } @@ -36,19 +36,15 @@ class PreAllocatedMemoryOCL : public PreAllocatedMemory { inline void run_prepare() { OCLContext::getInstance().handle.get()->curOpName = this->get_name(); - CHECK_STATUS(preallocated_memory(this->outputTensors[0], &this->archInfo)); + CHECK_STATUS(preallocated_memory(this->p, this->outputTensors[0], &this->archInfo)); } EE infer_output_tensors_size( std::vector inTensors, std::vector outTensors) override { this->needSetKernelVec = true; - if (inTensors.size() > 0) { - CHECK_STATUS(NOT_MATCH); - } - outTensors[0]->resize(this->desc); - CHECK_STATUS(preallocated_memory_infer_output_size(outTensors[0], &this->archInfo)); - return SUCCESS; + return preallocated_memory_infer_output_size( + inTensors, this->p, outTensors[0], &this->archInfo); } REGISTER_OCL_OPERATOR_RUN diff --git a/inference/engine/include/ocl/prelu_ocl.hpp b/inference/engine/include/ocl/prelu_ocl.hpp index 5ddd26ff..f2708611 100644 --- a/inference/engine/include/ocl/prelu_ocl.hpp +++ b/inference/engine/include/ocl/prelu_ocl.hpp @@ -39,13 +39,11 @@ class PReLUOCL : public PReLU { if (curOpWs.weight != nullptr) { weightNum = curOpWs.bytes_of_weight / UNI_MAX(1, bytesOf(curOpWs.mdt)); } - if (weightNum == 0) { - CHECK_STATUS(NOT_SUPPORTED); - } + CHECK_REQUIREMENT(weightNum != 0); if (weightNum == 1) { - this->preluDesc.propagate_down = true; + this->p.propagate_down = true; } else { - this->preluDesc.propagate_down = false; + this->p.propagate_down = false; } Tensor modelWeightTensor = Tensor(OCLMem); TensorDesc weightDesc = tensor1d(this->dt, weightNum); @@ -57,7 +55,7 @@ class PReLUOCL : public PReLU { inline void run_prepare() { OCLContext::getInstance().handle.get()->curOpName = this->get_name(); - CHECK_STATUS(prelu(this->inputTensors[0], this->weightTensors[0], this->preluDesc, + CHECK_STATUS(prelu(this->inputTensors[0], this->weightTensors[0], this->p, this->outputTensors[0], &this->archInfo)); } diff --git a/inference/engine/include/ocl/repeat_ocl.hpp b/inference/engine/include/ocl/repeat_ocl.hpp index 32a7efbc..e79baeaf 100644 --- a/inference/engine/include/ocl/repeat_ocl.hpp +++ b/inference/engine/include/ocl/repeat_ocl.hpp @@ -45,18 +45,10 @@ class RepeatOCL : public Repeat { if (this->inputTensors.size() > 1) { Tensor inputTensor = this->inputTensors[1]; TensorDesc inputDesc = inputTensor.get_desc(); - GCLMem_t ptr = (GCLMem_t)(((OclMemory *)(inputTensor.get_memory()))->get_ptr()); U32 length = tensorNumElements(inputDesc); - DataFormat df = ptr->desc.memFormat; - if (df != DF_NCHW) { - CHECK_STATUS(NOT_SUPPORTED); 
- } - U32 w_off, h_off; - w_off = ptr->desc.offset[0]; - h_off = ptr->desc.offset[1]; - if (w_off != 0 || h_off != 0) { - CHECK_STATUS(NOT_SUPPORTED); - } + GCLMem_t ptr = (GCLMem_t)(((OclMemory *)(inputTensor.get_memory()))->get_ptr()); + CHECK_REQUIREMENT(ptr->desc.memFormat == DF_NCHW); + CHECK_REQUIREMENT(ptr->desc.offset[0] == 0 && ptr->desc.offset[1] == 0); I32 *val = hostVal.get(); CHECK_STATUS(gcl_trans_memory(OCLContext::getInstance().handle.get(), ptr, val, &length, DEVICE_BUF_TO_HOST, CL_TRUE)); diff --git a/inference/engine/include/ocl/resize_ocl.hpp b/inference/engine/include/ocl/resize_ocl.hpp index b55ed05d..021174bc 100644 --- a/inference/engine/include/ocl/resize_ocl.hpp +++ b/inference/engine/include/ocl/resize_ocl.hpp @@ -19,7 +19,7 @@ class ResizeOCL : public Resize { public: - ResizeOCL(DataType paramDT, ResizeParamSpec p) : Resize(paramDT, p) + ResizeOCL(DataType dt, ResizeParamSpec p) : Resize(dt, p) { INIT_GPU_INFO(nullptr) } @@ -29,7 +29,7 @@ class ResizeOCL : public Resize { std::shared_ptr clone() override { std::shared_ptr mem = - std::shared_ptr(new ResizeOCL(this->paramDT, this->p)); + std::shared_ptr(new ResizeOCL(this->dt, this->p)); *mem = *this; return mem; } @@ -39,31 +39,16 @@ class ResizeOCL : public Resize { OCLContext::getInstance().handle.get()->curOpName = this->get_name(); Tensor inputTensor = this->inputTensors[0]; Tensor outputTensor = this->outputTensors[0]; - CHECK_STATUS(resize(inputTensor, this->temp, outputTensor, this->p, &this->archInfo)); + CHECK_STATUS(resize(inputTensor, this->p, this->temp, outputTensor, &this->archInfo)); } EE infer_output_tensors_size( std::vector inTensors, std::vector outTensors) override { this->needSetKernelVec = true; + CHECK_STATUS( + resize_infer_output_size(inTensors[0], this->p, outTensors[0], &this->archInfo)); TensorDesc desc = inTensors[0]->get_desc(); - U32 bytes; - switch (paramDT) { - case DT_F32: { - CHECK_REQUIREMENT(1 == this->p.scales[0] && 1 == this->p.scales[1]); - CHECK_STATUS(resize_infer_output_size(inTensors[0], this->paramDT, - this->p.scales + 2, outTensors[0], &bytes, &this->archInfo)); - break; - } - case DT_U32: { - CHECK_STATUS(resize_infer_output_size(inTensors[0], this->paramDT, this->p.sizes, - outTensors[0], &bytes, &this->archInfo)); - break; - } - default: { - CHECK_STATUS(NOT_SUPPORTED); - } - } if (desc.df == DF_NCHWC4 && check_tensors_image(inTensors)) { CHECK_STATUS(set_tensors_image(outTensors, inTensors.size())); } @@ -72,12 +57,10 @@ class ResizeOCL : public Resize { U32 infer_tmp_memory_size() override { - U32 size = 0; - TensorDesc inputDesc = inputTensors[0].get_desc(); - if (inputDesc.df == DF_NCHW && inputTensors[0].get_mem_type() != OCLMem) { - size = tensorNumBytes(inputDesc); - } - return size; + U32 bytes = 0; + CHECK_STATUS(resize_infer_forward_tmp_bytes( + this->inputTensors[0], this->p, this->outputTensors[0], &bytes, &this->archInfo)); + return bytes; } REGISTER_OCL_OPERATOR_RUN diff --git a/inference/engine/include/ocl/rnn_ocl.hpp b/inference/engine/include/ocl/rnn_ocl.hpp index deb78b70..0c919761 100644 --- a/inference/engine/include/ocl/rnn_ocl.hpp +++ b/inference/engine/include/ocl/rnn_ocl.hpp @@ -47,8 +47,8 @@ class RNNOCL : public RNNCellOCL { ((MaliPara_t)(this->archInfo.archPara))->forwardRunInfo->algorithm = CONVOLUTION_ALGORITHM_NULL; I32 algo[10]; - U32 algoNum = (this->p.numProjection > 0) ? 10 : 7; - std::string name = this->name + std::to_string(get_type()); + U32 algoNum = (this->p.num_projection > 0) ? 
10 : 7; + std::string name = this->name + std::to_string(get_type()); if (algorithmMap->getAlgorithmInfoFromMap(name, algo, algoNum)) { this->runInfo.algorithm = (ConvolutionForwardAlgorithm)algo[0]; this->runInfo.best_h[0] = algo[1]; @@ -119,13 +119,13 @@ class RNNOCL : public RNNCellOCL { this->wtm_gemv = std::shared_ptr(new Tensor(this->wtmType)); this->wtm_gemv->resize(ftmDesc[1]); this->wtm_gemv->alloc(); - if (this->p.numProjection > 0) { + if (this->p.num_projection > 0) { this->wtm_pro = std::shared_ptr(new Tensor(this->wtmType)); this->wtm_pro->resize(ftmDesc[2]); this->wtm_pro->alloc(); } - if (this->p.biDirection) { + if (this->p.bi_direction) { this->wtm_bi = std::shared_ptr(new Tensor(this->wtmType)); this->wtm_bi->resize(ftmDesc[0]); CHECK_STATUS(set_wtm_image(ftmDesc[0], &wtm_bi)); @@ -133,7 +133,7 @@ class RNNOCL : public RNNCellOCL { this->wtm_gemv_bi = std::shared_ptr(new Tensor(this->wtmType)); this->wtm_gemv_bi->resize(ftmDesc[1]); this->wtm_gemv_bi->alloc(); - if (this->p.numProjection > 0) { + if (this->p.num_projection > 0) { this->wtm_pro_bi = std::shared_ptr(new Tensor(this->wtmType)); this->wtm_pro_bi->resize(ftmDesc[2]); this->wtm_pro_bi->alloc(); @@ -147,8 +147,8 @@ class RNNOCL : public RNNCellOCL { CHECK_STATUS(alloc_wtm_memory()); std::vector filterTensors; std::vector ftmTensors; - U32 weightNum = (this->p.numProjection > 0) ? 2 : 1; - U32 directions = (this->p.biDirection) ? 2 : 1; + U32 weightNum = (this->p.num_projection > 0) ? 2 : 1; + U32 directions = (this->p.bi_direction) ? 2 : 1; for (U32 i = 0; i < directions; i++) { for (U32 j = 0; j < weightNum; j++) { filterTensors.push_back(this->weightTensors[i * weightNum + j]); @@ -157,13 +157,13 @@ class RNNOCL : public RNNCellOCL { ftmTensors.push_back(this->wtm.get()); ftmTensors.push_back(this->wtm_gemv.get()); - if (this->p.numProjection > 0) { + if (this->p.num_projection > 0) { ftmTensors.push_back(this->wtm_pro.get()); } - if (this->p.biDirection) { + if (this->p.bi_direction) { ftmTensors.push_back(this->wtm_bi.get()); ftmTensors.push_back(this->wtm_gemv_bi.get()); - if (this->p.numProjection > 0) { + if (this->p.num_projection > 0) { ftmTensors.push_back(this->wtm_pro_bi.get()); } } @@ -178,16 +178,16 @@ class RNNOCL : public RNNCellOCL { weightNumCount++; this->weightTensors[weightNumCount] = *this->wtm_gemv.get(); weightNumCount++; - if (this->p.numProjection > 0) { + if (this->p.num_projection > 0) { this->weightTensors[weightNumCount] = (*this->wtm_pro.get()); weightNumCount++; } - if (this->p.biDirection) { + if (this->p.bi_direction) { this->weightTensors[weightNumCount] = *this->wtm_bi.get(); weightNumCount++; this->weightTensors[weightNumCount] = *this->wtm_gemv_bi.get(); weightNumCount++; - if (this->p.numProjection > 0) { + if (this->p.num_projection > 0) { this->weightTensors[weightNumCount] = (*this->wtm_pro_bi.get()); weightNumCount++; } diff --git a/inference/engine/include/ocl/rnncell_ocl.hpp b/inference/engine/include/ocl/rnncell_ocl.hpp index b79b0eef..6a1170e9 100644 --- a/inference/engine/include/ocl/rnncell_ocl.hpp +++ b/inference/engine/include/ocl/rnncell_ocl.hpp @@ -41,13 +41,13 @@ class RNNCellOCL : public RNNCell { Tensor hTensor = this->outputTensors[0]; CHECK_STATUS(rnncell(xTensor, this->weightTensors, this->biasTensors, stateTensor, this->p, - this->xDim, this->p.numOutput, 0, this->temp, hTensor, &this->archInfo)); + this->xDim, this->p.num_outputs, 0, this->temp, hTensor, &this->archInfo)); } EE infer_forward_algorithm(std::shared_ptr algorithmMap) override { - if 
(this->p.biDirection) { - CHECK_STATUS(NOT_SUPPORTED); + if (this->p.bi_direction) { + UNI_ERROR_LOG("gpu not support bi-direction rnn.\n"); } OCLContext::getInstance().handle.get()->kernelVec = &this->opKernelVec; Tensor xTensor = this->inputTensors[0]; @@ -58,8 +58,8 @@ class RNNCellOCL : public RNNCell { ((MaliPara_t)(this->archInfo.archPara))->forwardRunInfo->algorithm = CONVOLUTION_ALGORITHM_NULL; I32 algo[7]; - U32 algoNum = (this->p.numProjection > 0) ? 7 : 4; - std::string name = this->name + std::to_string(get_type()); + U32 algoNum = (this->p.num_projection > 0) ? 7 : 4; + std::string name = this->name + std::to_string(get_type()); if (algorithmMap->getAlgorithmInfoFromMap(name, algo, algoNum)) { this->runInfo.algorithm = (ConvolutionForwardAlgorithm)algo[0]; this->runInfo.best_h[0] = algo[1]; @@ -72,7 +72,7 @@ class RNNCellOCL : public RNNCell { } } else { CHECK_STATUS(rnncell_infer_forward_algorithm(xTensor, filterTensor, biasTensor, - stateTensor, this->p, this->xDim, this->p.numOutput, hTensor, &this->archInfo)); + stateTensor, this->p, this->xDim, this->p.num_outputs, hTensor, &this->archInfo)); algo[0] = this->runInfo.algorithm; algo[1] = this->runInfo.best_h[0]; algo[2] = this->runInfo.best_c[0]; @@ -118,7 +118,7 @@ class RNNCellOCL : public RNNCell { this->wtm = std::shared_ptr(new Tensor(this->wtmType)); this->wtm->resize(ftmDesc[0]); this->wtm->alloc(); - if (this->p.numProjection > 0) { + if (this->p.num_projection > 0) { this->wtm_pro = std::shared_ptr(new Tensor(this->wtmType)); this->wtm_pro->resize(ftmDesc[1]); this->wtm_pro->alloc(); @@ -133,13 +133,13 @@ class RNNCellOCL : public RNNCell { std::vector ftmTensors; filterTensors.push_back(this->weightTensors[0]); ftmTensors.push_back(this->wtm.get()); - if (this->p.numProjection > 0) { + if (this->p.num_projection > 0) { filterTensors.push_back(this->weightTensors[1]); ftmTensors.push_back(this->wtm_pro.get()); } CHECK_STATUS(rnncell_transform_filter(filterTensors, this->p, ftmTensors, &this->archInfo)); this->weightTensors[0] = *this->get_wtm(); - if (this->p.numProjection > 0) { + if (this->p.num_projection > 0) { this->weightTensors[1] = *wtm_pro.get(); } return SUCCESS; @@ -147,20 +147,20 @@ class RNNCellOCL : public RNNCell { EE infer_weight_desc() override { - U32 column = (this->p.numProjection > 0) ? this->p.numProjection : this->p.numOutput; + U32 column = (this->p.num_projection > 0) ? this->p.num_projection : this->p.num_outputs; U32 filterRow = 4 * column; - U32 filterCol = this->p.numOutput + this->xDim; + U32 filterCol = this->p.num_outputs + this->xDim; TensorDesc weightDesc[2]; TensorDesc biasDesc[2]; weightDesc[0] = tensor2df(this->dt, DF_NK, filterRow, filterCol); - weightDesc[1] = tensor2df(this->dt, DF_NK, this->p.numOutput, this->p.numProjection); + weightDesc[1] = tensor2df(this->dt, DF_NK, this->p.num_outputs, this->p.num_projection); biasDesc[0] = tensor1d(this->dt, filterRow); - biasDesc[1] = tensor1d(this->dt, this->p.numOutput); - U32 weightNum = (this->p.numProjection > 0) ? 2 : 1; + biasDesc[1] = tensor1d(this->dt, this->p.num_outputs); + U32 weightNum = (this->p.num_projection > 0) ? 2 : 1; U32 biasNum = weightNum; - U32 diretions = (this->p.biDirection) ? 2 : 1; + U32 diretions = (this->p.bi_direction) ? 
2 : 1; if (this->p.mode != RNN_LSTM) { - CHECK_STATUS(NOT_SUPPORTED); + UNI_ERROR_LOG("gpu rnn only support lstm.\n"); } for (U32 d = 0; d < diretions; d++) { diff --git a/inference/engine/include/ocl/roialign_ocl.hpp b/inference/engine/include/ocl/roialign_ocl.hpp index 744fb615..a0bfd60e 100644 --- a/inference/engine/include/ocl/roialign_ocl.hpp +++ b/inference/engine/include/ocl/roialign_ocl.hpp @@ -18,7 +18,7 @@ class RoIAlignOCL : public RoIAlign { public: - RoIAlignOCL(RoIAlignParamSpec p) : RoIAlign(p) + RoIAlignOCL(DataType dt, RoIAlignParamSpec p) : RoIAlign(dt, p) { INIT_GPU_INFO(nullptr) } @@ -27,7 +27,8 @@ class RoIAlignOCL : public RoIAlign { std::shared_ptr clone() override { - std::shared_ptr mem = std::shared_ptr(new RoIAlignOCL(this->p)); + std::shared_ptr mem = + std::shared_ptr(new RoIAlignOCL(this->dt, this->p)); *mem = *this; return mem; } diff --git a/inference/engine/include/ocl/scale_ocl.hpp b/inference/engine/include/ocl/scale_ocl.hpp index 1b86a991..f6df3a22 100644 --- a/inference/engine/include/ocl/scale_ocl.hpp +++ b/inference/engine/include/ocl/scale_ocl.hpp @@ -28,7 +28,7 @@ class ScaleOCL : public Scale { std::shared_ptr clone() override { std::shared_ptr mem = - std::shared_ptr(new ScaleOCL(this->dt, this->p, this->numChannels)); + std::shared_ptr(new ScaleOCL(this->dt, this->p, 0)); *mem = *this; return mem; } @@ -38,7 +38,6 @@ class ScaleOCL : public Scale { auto curOpWs = this->get_weightspec(); U32 weightNum = 0; U32 vecNum = 0; - this->numChannels = 0; if (0 != curOpWs.bytes_of_weight) { weightNum = curOpWs.bytes_of_weight / UNI_MAX(1, bytesOf(curOpWs.mdt)); } @@ -72,10 +71,7 @@ class ScaleOCL : public Scale { int inputNum = this->inputTensors.size(); Tensor inputTensor = this->inputTensors[this->dataID]; Tensor outputTensor = this->outputTensors[0]; - if (inputNum == 1 && weightTensors.size() == 0 && biasTensors.size() == 0) { - CHECK_STATUS(NOT_MATCH); - } - + CHECK_REQUIREMENT(inputNum != 1 || weightTensors.size() != 0 || biasTensors.size() != 0); if (inputNum > 1) { U32 cNum = this->inputTensors[this->dataID].get_desc().dims[2]; for (int i = 0; i < inputNum; i++) { @@ -92,8 +88,10 @@ class ScaleOCL : public Scale { desc.offset[1] == 0) { continue; } + } else { + UNI_ERROR_LOG("gpu scale not support %s format input.\n", + DataFormatName()[desc.memFormat]); } - CHECK_STATUS(NOT_MATCH); } } } diff --git a/inference/engine/include/onehot.hpp b/inference/engine/include/onehot.hpp new file mode 100644 index 00000000..2e0876a9 --- /dev/null +++ b/inference/engine/include/onehot.hpp @@ -0,0 +1,35 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _ONEHOT_H +#define _ONEHOT_H + +#include "operator.hpp" + +class OneHot : public Operator { +public: + explicit OneHot(DataType dt, OneHotParamSpec p) + { + this->dt = dt; + this->p = p; + } + + OperatorType get_type() override + { + return OT_OneHot; + } + +protected: + OneHotParamSpec p; +}; +#endif // _ONEHOT_H diff --git a/inference/engine/include/operator.hpp b/inference/engine/include/operator.hpp index 5426ac73..810dac88 100644 --- a/inference/engine/include/operator.hpp +++ b/inference/engine/include/operator.hpp @@ -23,7 +23,7 @@ #include "gcl_engine.h" #include "image_container.hpp" #endif -#include "parameter_spec.h" +#include "tensor_computing.h" class Operator { public: @@ -155,7 +155,7 @@ class Operator { featureScale.resize(num); for (U32 i = 0; i < num; i++) { featureScale[i].resize(qs[i].num_scale); - memcpy(featureScale[i].data(), qs[i].scale, qs[i].num_scale * bytesOf(DT_F32)); + UNI_MEMCPY(featureScale[i].data(), qs[i].scale, qs[i].num_scale * bytesOf(DT_F32)); } #endif } @@ -251,7 +251,7 @@ class Operator { if (size[0] == 0 && size[1] == 0 && size[2] == 0) { return false; } else if (size[0] == 0 || size[1] == 0 || size[2] == 0) { - CHECK_STATUS(NOT_MATCH); + UNI_ERROR_LOG("gpu tmp buffer(on image buffer) parameter is wrong.\n"); } *tensor = this->tempImages->get(slot, size[0], size[1], size[2]); findMatchImage = true; @@ -302,6 +302,25 @@ class Operator { } #endif + int is_shape(std::vector tensors) + { + int count = 0; + for (U32 i = 0; i < tensors.size(); i++) { + count += tensorIsShape(tensors[i]->get_desc()); + } + return count; + } + + TensorDesc tensor_shape(Tensor tensor) + { + TensorDesc desc = tensor.get_desc(); + U32 *ptr = (U32 *)((CpuMemory *)(tensor.get_memory()))->get_ptr(); + for (U32 i = 0; i < tensor.length() && desc.nDims + i < DIM_LEN; i++) { + desc.dims[desc.nDims + i] = ptr[i]; + } + return desc; + } + protected: ArchInfo archInfo; DataType dt; diff --git a/inference/engine/include/parse_command.h b/inference/engine/include/parse_command.h index 81db5c4d..fb3ef113 100644 --- a/inference/engine/include/parse_command.h +++ b/inference/engine/include/parse_command.h @@ -20,19 +20,12 @@ #include "error.h" #ifdef _USE_FP16 - -inline U32 getBinFileSize(CI8 *dataPath, CI8 *dataName) +inline U32 getBinFileSize(CI8 *directory, CI8 *name) { - std::string filePath = dataPath; - CI8 lastFlag = filePath[filePath.length() - 1]; - if (strcmp(&lastFlag, "/") != 0) { - filePath += "/"; - } - std::string fileName = dataName; - fileName = filePath + fileName; - FILE *file = fopen(fileName.c_str(), "rb"); + std::string path = std::string(directory) + std::string("/") + std::string(name); + FILE *file = fopen(path.c_str(), "rb"); if (file == NULL) { - UNI_WARNING_LOG("can not get %s file size.\n", fileName.c_str()); + UNI_ERROR_LOG("can not get %s file size.\n", path.c_str()); return 0; } fseek(file, 0, SEEK_END); @@ -42,50 +35,34 @@ inline U32 getBinFileSize(CI8 *dataPath, CI8 *dataName) return size; } -inline void writeF16ToF32Bin(F16 *data, U32 num, CI8 *dataPath, CI8 *dataName) +inline void writeF16ToF32Bin(F16 *data, U32 num, CI8 *directory, CI8 *name) { - std::string filePath = dataPath; - CI8 lastFlag = filePath[filePath.length() - 1]; - if (strcmp(&lastFlag, "/") != 0) { - filePath += 
"/"; - } - std::string fileName = dataName; - fileName = filePath + fileName; - FILE *outfile = fopen(fileName.c_str(), "wb"); - if (outfile == NULL) { - UNI_WARNING_LOG("can not write %s.\n", fileName.c_str()); + std::string path = std::string(directory) + std::string("/") + std::string(name); + FILE *file = fopen(path.c_str(), "wb"); + if (file == NULL) { + UNI_ERROR_LOG("can not write %s.\n", path.c_str()); return; } - F32 *dataTran = new F32[num]; - for (U32 i = 0; i < num; i++) { - dataTran[i] = (F32)data[i]; - } - fwrite(dataTran, sizeof(float), num, outfile); - fclose(outfile); - delete[] dataTran; + float *buffer = (float *)UNI_MALLOC(sizeof(float) * num); + transformToFloat(DT_F16, data, buffer, num); + fwrite(buffer, sizeof(float), num, file); + fclose(file); + UNI_FREE(buffer); } -inline void readF32BinToF16(F16 *data, U32 num, CI8 *dataPath, CI8 *dataName) +inline void readF32BinToF16(F16 *data, U32 num, CI8 *directory, CI8 *name) { - std::string filePath = dataPath; - CI8 lastFlag = filePath[filePath.length() - 1]; - if (strcmp(&lastFlag, "/") != 0) { - filePath += "/"; - } - std::string fileName = dataName; - fileName = filePath + fileName; - FILE *infile = fopen(fileName.c_str(), "rb"); - if (infile == NULL) { - UNI_WARNING_LOG("can not read %s.\n", fileName.c_str()); + std::string path = std::string(directory) + std::string("/") + std::string(name); + FILE *file = fopen(path.c_str(), "rb"); + if (file == NULL) { + UNI_ERROR_LOG("can not read %s.\n", path.c_str()); return; } - F32 *dataTran = new F32[num]; - fread(dataTran, sizeof(float), num, infile); - for (U32 i = 0; i < num; i++) { - data[i] = (F16)dataTran[i]; - } - fclose(infile); - delete[] dataTran; + float *buffer = (float *)UNI_MALLOC(sizeof(float) * num); + fread(buffer, sizeof(float), num, file); + transformFromFloat(DT_F16, buffer, data, num); + fclose(file); + UNI_FREE(buffer); } #endif diff --git a/inference/engine/include/pooling.hpp b/inference/engine/include/pooling.hpp index 677a303f..1c0f8f7c 100644 --- a/inference/engine/include/pooling.hpp +++ b/inference/engine/include/pooling.hpp @@ -15,7 +15,6 @@ #define _POOLING_H #include "operator.hpp" -#include "tensor_computing.h" class Pooling : public Operator { public: diff --git a/inference/engine/include/preallocated_memory.hpp b/inference/engine/include/preallocated_memory.hpp index 6a909c54..d83befd3 100644 --- a/inference/engine/include/preallocated_memory.hpp +++ b/inference/engine/include/preallocated_memory.hpp @@ -18,10 +18,9 @@ class PreAllocatedMemory : public Operator { public: - PreAllocatedMemory(DataType dt, TensorDesc desc) + PreAllocatedMemory(PreAllocatedMemoryParamSpec p) { - this->dt = dt; - this->desc = desc; + this->p = p; } OperatorType get_type() override @@ -30,7 +29,7 @@ class PreAllocatedMemory : public Operator { } protected: - TensorDesc desc; + PreAllocatedMemoryParamSpec p; }; #endif // _PREALLOCATED_MEMORY_H diff --git a/inference/engine/include/prelu.hpp b/inference/engine/include/prelu.hpp index 0a0e504c..5e020f84 100644 --- a/inference/engine/include/prelu.hpp +++ b/inference/engine/include/prelu.hpp @@ -29,7 +29,7 @@ class PReLU : public WeightOperator { } protected: - PReLUParamSpec preluDesc; + PReLUParamSpec p; }; #endif // _PADDING_H diff --git a/inference/engine/include/prior_box.hpp b/inference/engine/include/prior_box.hpp index 4ee39b74..b93a00f5 100644 --- a/inference/engine/include/prior_box.hpp +++ b/inference/engine/include/prior_box.hpp @@ -44,8 +44,7 @@ class PriorBox : public Operator { EE 
infer_output_tensors_size( std::vector inTensors, std::vector outTensors) override { - CHECK_STATUS(priorbox_infer_output_size(inTensors, this->p, outTensors[0], &this->archInfo)); - return SUCCESS; + return priorbox_infer_output_size(inTensors, this->p, outTensors[0], &this->archInfo); } protected: diff --git a/inference/engine/include/quantizelinear.hpp b/inference/engine/include/quantizelinear.hpp new file mode 100644 index 00000000..2cf3246b --- /dev/null +++ b/inference/engine/include/quantizelinear.hpp @@ -0,0 +1,36 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _QUANTIZELINEAR_H +#define _QUANTIZELINEAR_H + +#include "operator.hpp" + +class QuantizeLinear : public Operator { +public: + QuantizeLinear(DataType dt, QuantizeLinearParamSpec p) + { + this->dt = dt; + this->p = p; + } + + OperatorType get_type() override + { + return OT_QuantizeLinear; + } + +protected: + QuantizeLinearParamSpec p; +}; + +#endif // _QUANTIZELINEAR_H diff --git a/inference/engine/include/range.hpp b/inference/engine/include/range.hpp new file mode 100644 index 00000000..7d54ae90 --- /dev/null +++ b/inference/engine/include/range.hpp @@ -0,0 +1,35 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#ifndef _RANGE_H +#define _RANGE_H + +#include "operator.hpp" + +class Range : public Operator { +public: + explicit Range(DataType dt, RangeParamSpec p) + { + this->dt = dt; + this->p = p; + } + + OperatorType get_type() override + { + return OT_Range; + } + +protected: + RangeParamSpec p; +}; +#endif // _RANGE_H diff --git a/inference/engine/include/reduction.hpp b/inference/engine/include/reduction.hpp index bca7f650..fa4f08e0 100644 --- a/inference/engine/include/reduction.hpp +++ b/inference/engine/include/reduction.hpp @@ -15,7 +15,6 @@ #define _REDUCTION_H #include "operator.hpp" -#include "tensor_computing.h" class Reduction : public Operator { public: @@ -41,10 +40,10 @@ class Reduction : public Operator { EE infer_output_tensors_size( std::vector inTensors, std::vector outTensors) override { - if (this->p.axes_num == 0) { + if (this->p.num_axes == 0) { TensorDesc desc = inTensors[0]->get_desc(); - this->p.axes_num = desc.nDims; - for (int i = 0; i < this->p.axes_num; i++) { + this->p.num_axes = desc.nDims; + for (int i = 0; i < this->p.num_axes; i++) { this->p.axes[i] = i; } } diff --git a/inference/engine/include/relative_position_embedding.hpp b/inference/engine/include/relative_position_embedding.hpp index 3af4d378..f660533f 100644 --- a/inference/engine/include/relative_position_embedding.hpp +++ b/inference/engine/include/relative_position_embedding.hpp @@ -53,29 +53,29 @@ class RelativePositionEmbedding : public EmbeddingCPU { U32 batch = inputDesc.dims[inputDesc.nDims - 1]; U32 length = inputDesc.dims[inputDesc.nDims - 1 - tmpAxis]; for (U32 in = 0; in < batch; in++) { - U8 *ptr = outputPtr + in * length * this->p.num_output * bytesOf(this->dt); - if (length > this->p.input_dim) { - U32 size = (length - this->p.input_dim) * this->p.num_output * bytesOf(this->dt); - memset(ptr, 0, size); + U8 *ptr = outputPtr + in * length * this->p.num_outputs * bytesOf(this->dt); + if (length > this->p.num_inputs) { + U32 size = (length - this->p.num_inputs) * this->p.num_outputs * bytesOf(this->dt); + UNI_MEMSET(ptr, 0, size); ptr += size; } U32 start = 0; - U32 copyLength = this->p.input_dim; - if (length < this->p.input_dim) { - start = this->p.input_dim - length; + U32 copyLength = this->p.num_inputs; + if (length < this->p.num_inputs) { + start = this->p.num_inputs - length; copyLength = length; } if (this->p.transpose) { for (U32 i = 0; i < copyLength; i++) { - for (U32 j = 0; j < this->p.num_output; j++) { - memcpy(ptr, - weightPtr + (j * this->p.input_dim + start + i) * bytesOf(this->dt), + for (U32 j = 0; j < this->p.num_outputs; j++) { + UNI_MEMCPY(ptr, + weightPtr + (j * this->p.num_inputs + start + i) * bytesOf(this->dt), bytesOf(this->dt)); } } } else { - memcpy(ptr, weightPtr + start * this->p.num_output * bytesOf(this->dt), - copyLength * this->p.num_output * bytesOf(this->dt)); + UNI_MEMCPY(ptr, weightPtr + start * this->p.num_outputs * bytesOf(this->dt), + copyLength * this->p.num_outputs * bytesOf(this->dt)); } } } @@ -87,7 +87,7 @@ class RelativePositionEmbedding : public EmbeddingCPU { I32 tmpAxis = (this->p.axis + inDim.nDims) % inDim.nDims; U32 batch = inDim.dims[inDim.nDims - 1]; U32 length = inDim.dims[inDim.nDims - 1 - tmpAxis]; - TensorDesc outDim = tensor3df(this->dt, DF_MTK, batch, length, this->p.num_output); + TensorDesc outDim = tensor3df(this->dt, DF_MTK, batch, length, this->p.num_outputs); outTensors[0]->resize(outDim); return SUCCESS; } diff --git a/inference/engine/include/relative_shift.hpp b/inference/engine/include/relative_shift.hpp index 
74248d25..759d856a 100644 --- a/inference/engine/include/relative_shift.hpp +++ b/inference/engine/include/relative_shift.hpp @@ -50,7 +50,7 @@ class RelativeShift : public Operator { U32 length = inputDesc.dims[tmpAxis]; if (tmpAxis + 1 >= (I32)inputDesc.nDims) { U32 bytes = inputTensor.bytes(); - memcpy(outputPtr, inputPtr, bytes); + UNI_MEMCPY(outputPtr, inputPtr, bytes); return; } U32 loops = inputDesc.dims[tmpAxis + 1]; @@ -72,13 +72,13 @@ class RelativeShift : public Operator { (loops - this->p.shift_length) * (this->p.shift_length + length); U32 start = this->p.shift_length * length - num; U32 srcIndex = start * tileSize; - memcpy(dstPtr, srcPtr + srcIndex, num * tileSize); + UNI_MEMCPY(dstPtr, srcPtr + srcIndex, num * tileSize); dstPtr += num * tileSize; srcIndex += num * tileSize; for (U32 j = this->p.shift_length; j < loops; j++) { - memset(dstPtr, 0, this->p.shift_length * tileSize); + UNI_MEMSET(dstPtr, 0, this->p.shift_length * tileSize); dstPtr += this->p.shift_length * tileSize; - memcpy(dstPtr, srcPtr + srcIndex, chunkSize); + UNI_MEMCPY(dstPtr, srcPtr + srcIndex, chunkSize); dstPtr += chunkSize; srcIndex += chunkSize; } @@ -87,7 +87,7 @@ class RelativeShift : public Operator { srcPtr += this->p.shift_length * loops * tileSize; for (U32 j = 0; j < loops; j++) { for (U32 k = 0; k < klen; k++) { - memcpy(dstPtr, srcPtr, tileSize); + UNI_MEMCPY(dstPtr, srcPtr, tileSize); srcPtr += tileSize; dstPtr += tileSize; } diff --git a/inference/engine/include/resize.hpp b/inference/engine/include/resize.hpp index 981855b0..88da402f 100644 --- a/inference/engine/include/resize.hpp +++ b/inference/engine/include/resize.hpp @@ -18,14 +18,10 @@ class Resize : public Operator { public: - Resize(DataType paramDT, ResizeParamSpec p) + Resize(DataType dt, ResizeParamSpec p) { - if (paramDT == DT_F32 || paramDT == DT_U32) { - this->paramDT = paramDT; - this->p = p; - } else { - CHECK_STATUS(NOT_SUPPORTED); - } + this->dt = dt; + this->p = p; } OperatorType get_type() override @@ -34,7 +30,6 @@ class Resize : public Operator { } protected: - DataType paramDT; ResizeParamSpec p; }; diff --git a/inference/engine/include/roialign.hpp b/inference/engine/include/roialign.hpp index b171636d..d01f60be 100644 --- a/inference/engine/include/roialign.hpp +++ b/inference/engine/include/roialign.hpp @@ -15,12 +15,12 @@ #define _ROIALIGN_H #include "operator.hpp" -#include "tensor_computing.h" class RoIAlign : public Operator { public: - RoIAlign(RoIAlignParamSpec p) + RoIAlign(DataType dt, RoIAlignParamSpec p) { + this->dt = dt; this->p = p; } diff --git a/inference/engine/include/scale.hpp b/inference/engine/include/scale.hpp index 9ee5490a..d8f006b6 100644 --- a/inference/engine/include/scale.hpp +++ b/inference/engine/include/scale.hpp @@ -22,7 +22,6 @@ class Scale : public WeightOperator { { this->dt = dt; this->p = p; - this->numChannels = numChannels; this->dataID = 0; } @@ -39,44 +38,30 @@ class Scale : public WeightOperator { U32 find_target_axis_len(std::vector inTensors) { auto curOpWs = this->get_weightspec(); - U32 weightNum = 0; - U32 vecNum = 0; + int weightNum = 0; + int vecNum = 0; if (0 != curOpWs.bytes_of_weight) { weightNum = curOpWs.bytes_of_weight / UNI_MAX(1, bytesOf(curOpWs.mdt)); } else if (0 != curOpWs.bytes_of_vec) { vecNum = curOpWs.bytes_of_vec / UNI_MAX(1, bytesOf(curOpWs.mdt)); } if (weightNum > 0 && vecNum > 0 && weightNum != vecNum) { - CHECK_STATUS(NOT_MATCH); + UNI_ERROR_LOG( + "scale alpha length(%d) is not equal to beta length(%d).\n", weightNum, vecNum); } - 
this->numChannels = (weightNum) ? weightNum : vecNum; + int numChannels = (weightNum) ? weightNum : vecNum; if (weightNum == 0 && vecNum == 0) { if (inTensors.size() == 1) { - CHECK_STATUS(NOT_MATCH); + UNI_ERROR_LOG("scale doesn't have alpha or beta.\n"); } TensorDesc desc = inTensors[1 - dataID]->get_desc(); - this->numChannels = tensorNumElements(desc); + numChannels = tensorNumElements(desc); } - - TensorDesc inputDesc = inTensors[dataID]->get_desc(); - U32 axisLen = this->numChannels; - I32 axis = p.axis; - U32 nDims = inputDesc.nDims; - axis = (nDims + axis) % nDims; - axis = nDims - 1 - axis; - if (axisLen != inputDesc.dims[axis]) { - for (U32 i = 0; i < nDims; i++) { - if (inputDesc.dims[nDims - 1 - i] == axisLen) { - p.axis = i; - } - } - } - return axisLen; + return numChannels; } protected: ScaleParamSpec p; - U32 numChannels; int dataID; }; diff --git a/inference/engine/include/select.hpp b/inference/engine/include/select.hpp index 06d0da04..c552c11c 100644 --- a/inference/engine/include/select.hpp +++ b/inference/engine/include/select.hpp @@ -27,10 +27,5 @@ class Select : public Operator { { return OT_Select; } - - bool can_input_output_the_same() override - { - return false; - } }; #endif // _SELECT_H diff --git a/inference/engine/include/weight_operator.hpp b/inference/engine/include/weight_operator.hpp index a5770a36..7d4d1488 100644 --- a/inference/engine/include/weight_operator.hpp +++ b/inference/engine/include/weight_operator.hpp @@ -15,7 +15,6 @@ #define _WEIGHTOPERATOR_H #include "operator.hpp" -#include "tensor_computing.h" #include "model_spec.h" class WeightOperator : public Operator { @@ -118,7 +117,7 @@ class WeightOperator : public Operator { return SUCCESS; } - virtual EE init_weight_bias_from_model(std::shared_ptr *modelPtr) + virtual EE init_weight_bias_from_model(std::shared_ptr *modelPtr = nullptr) { EE ret = this->infer_weight_desc(); if (ret != SUCCESS) { @@ -151,6 +150,15 @@ class WeightOperator : public Operator { weight_offset += tensorNumBytes(desc); } + if (curOpWs.num_quant_scale == this->weightTensors.size()) { + for (U32 i = 0; i < this->weightTensors.size(); ++i) { + if (curOpWs.weight_scale[i].num_scale > 0) { + this->weightTensors[i].set_scale_ptr( + std::shared_ptr(curOpWs.weight_scale[i].scale, [](F32 *) {})); + } + } + } + U32 bias_offset = (modelPtr != nullptr) ? 
weight_offset : 0; if (this->hasBias) { for (auto bias_tensor : this->biasTensors) { @@ -169,7 +177,7 @@ class WeightOperator : public Operator { bias_mem_src.resize(desc); bias_mem_src.alloc(); U8 *tmp = (U8 *)bias_mem_src.get_ptr(); - memset(tmp, 0, bias_mem_src.bytes()); + UNI_MEMSET(tmp, 0, bias_mem_src.bytes()); bias_mem_dst->reuse(&bias_mem_src); } } diff --git a/inference/engine/include/where.hpp b/inference/engine/include/where.hpp index a22f1221..5ec80270 100644 --- a/inference/engine/include/where.hpp +++ b/inference/engine/include/where.hpp @@ -14,9 +14,9 @@ #ifndef _WHERE_H #define _WHERE_H -#include "weight_operator.hpp" +#include "operator.hpp" -class Where : public WeightOperator { +class Where : public Operator { public: Where(DataType dt) { @@ -27,11 +27,6 @@ class Where : public WeightOperator { { return OT_Where; } - - bool can_input_output_the_same() override - { - return false; - } }; #endif // _WHERE_H diff --git a/inference/engine/include/yolov3_detection_output.hpp b/inference/engine/include/yolov3_detection_output.hpp index 1c4f6188..aa3cb678 100644 --- a/inference/engine/include/yolov3_detection_output.hpp +++ b/inference/engine/include/yolov3_detection_output.hpp @@ -46,9 +46,8 @@ class Yolov3DetectionOutput : public Operator { EE infer_output_tensors_size( std::vector inTensors, std::vector outTensors) override { - CHECK_STATUS(yolov3detectionoutput_infer_output_size( - inTensors, this->p, outTensors[0], &this->archInfo)); - return SUCCESS; + return yolov3detectionoutput_infer_output_size( + inTensors, this->p, outTensors[0], &this->archInfo); } protected: diff --git a/inference/engine/src/BoltModel_Jni.cpp b/inference/engine/src/BoltModel_Jni.cpp index 5aa8f14a..32957760 100644 --- a/inference/engine/src/BoltModel_Jni.cpp +++ b/inference/engine/src/BoltModel_Jni.cpp @@ -48,6 +48,8 @@ AFFINITY_TYPE str2AFFINITY_TYPE(std::string affinity_str) ret = CPU_LOW_POWER; } else if (affinity_str == "GPU") { ret = GPU; + } else if (affinity_str == "CPU") { + ret = CPU; } else { UNI_ERROR_LOG("unsupported JNI CPU affinity setting %s.\n", affinity_str.c_str()); } @@ -84,7 +86,7 @@ DATA_TYPE str2DATA_TYPE(std::string data_type) DATA_TYPE ret = FP_32; if (data_type == "FP32") { ret = FP_32; -#ifdef __aarch64__ +#ifdef _USE_FP16 } else if (data_type == "FP16") { ret = FP_16; #endif @@ -186,13 +188,13 @@ void getInputParameters(JNIEnv *env, UNI_ERROR_LOG("input DataFormat array length %d is not equal to input num %d\n", env->GetArrayLength(df_input), num); } - int *data_n = (int *)malloc(num * sizeof(int)); - int *data_c = (int *)malloc(num * sizeof(int)); - int *data_h = (int *)malloc(num * sizeof(int)); - int *data_w = (int *)malloc(num * sizeof(int)); - char **data_name = (char **)malloc(num * sizeof(char *)); - DATA_TYPE *data_dt = (DATA_TYPE *)malloc(num * sizeof(DATA_TYPE)); - DATA_FORMAT *data_df = (DATA_FORMAT *)malloc(num * sizeof(DATA_FORMAT)); + int *data_n = (int *)UNI_MALLOC(num * sizeof(int)); + int *data_c = (int *)UNI_MALLOC(num * sizeof(int)); + int *data_h = (int *)UNI_MALLOC(num * sizeof(int)); + int *data_w = (int *)UNI_MALLOC(num * sizeof(int)); + char **data_name = (char **)UNI_MALLOC(num * sizeof(char *)); + DATA_TYPE *data_dt = (DATA_TYPE *)UNI_MALLOC(num * sizeof(DATA_TYPE)); + DATA_FORMAT *data_df = (DATA_FORMAT *)UNI_MALLOC(num * sizeof(DATA_FORMAT)); jint *curArray_n = env->GetIntArrayElements(n, 0); jint *curArray_c = env->GetIntArrayElements(c, 0); jint *curArray_h = env->GetIntArrayElements(h, 0); @@ -206,7 +208,7 @@ void getInputParameters(JNIEnv 
*env, jstring cur_str = (jstring)(env->GetObjectArrayElement(input_names, i)); const char *cur_str_ptr = env->GetStringUTFChars(cur_str, 0); int length = strlen(cur_str_ptr); - data_name[i] = (char *)malloc(sizeof(char) * (length + 1)); + data_name[i] = (char *)UNI_MALLOC(sizeof(char) * (length + 1)); UNI_MEMCPY(data_name[i], cur_str_ptr, length); data_name[i][length] = '\0'; @@ -265,9 +267,9 @@ extern "C" JNIEXPORT jlong JNICALL BOLT_JNI_PREFIX(BoltModel_createModel)( const char *affinityPtr = env->GetStringUTFChars(affinity, JNI_FALSE); std::string affinity_str = (std::string)affinityPtr; AFFINITY_TYPE affinity_cur = str2AFFINITY_TYPE(affinity_str); - long modelAddr = (long)CreateModel(modelPathPtr, affinity_cur, NULL); + long modelAddr = (long long)CreateModel(modelPathPtr, affinity_cur, NULL); ModelHandleInfo *ihInfo = (ModelHandleInfo *)modelAddr; - if (nullptr == ihInfo->cnn) { + if (nullptr == ihInfo) { UNI_ERROR_LOG("Bolt instance not created\n"); modelAddr = 0; } @@ -283,7 +285,7 @@ extern "C" JNIEXPORT jlong JNICALL BOLT_JNI_PREFIX(BoltModel_cloneModel)( UNI_DEBUG_LOG("JNI %s...\n", __FUNCTION__); ModelHandle handle = (ModelHandle)modelAddr; ModelHandle cloneHandle = CloneModel(handle); - long ret = (long)cloneHandle; + long ret = (long long)cloneHandle; UNI_DEBUG_LOG("JNI %s end.\n", __FUNCTION__); return ret; } @@ -315,16 +317,16 @@ extern "C" JNIEXPORT void JNICALL BOLT_JNI_PREFIX(BoltModel_prepareModel)(JNIEnv PrepareModel( ih, num_input, (const char **)data_name, data_n, data_c, data_h, data_w, data_dt, data_df); - free(data_n); - free(data_c); - free(data_h); - free(data_w); + UNI_FREE(data_n); + UNI_FREE(data_c); + UNI_FREE(data_h); + UNI_FREE(data_w); for (int i = 0; i < num_input; i++) { - free(data_name[i]); + UNI_FREE(data_name[i]); } - free(data_name); - free(data_dt); - free(data_df); + UNI_FREE(data_name); + UNI_FREE(data_dt); + UNI_FREE(data_df); UNI_DEBUG_LOG("JNI %s end.\n", __FUNCTION__); } @@ -355,16 +357,16 @@ extern "C" JNIEXPORT void JNICALL BOLT_JNI_PREFIX(BoltModel_resizeModelInput)(JN ResizeModelInput( ih, num_input, (const char **)data_name, data_n, data_c, data_h, data_w, data_dt, data_df); - free(data_n); - free(data_c); - free(data_h); - free(data_w); + UNI_FREE(data_n); + UNI_FREE(data_c); + UNI_FREE(data_h); + UNI_FREE(data_w); for (int i = 0; i < num_input; i++) { - free(data_name[i]); + UNI_FREE(data_name[i]); } - free(data_name); - free(data_dt); - free(data_df); + UNI_FREE(data_name); + UNI_FREE(data_dt); + UNI_FREE(data_df); UNI_DEBUG_LOG("JNI %s end.\n", __FUNCTION__); } @@ -375,7 +377,7 @@ extern "C" JNIEXPORT jlong JNICALL BOLT_JNI_PREFIX(BoltModel_allocAllResultHandl ModelHandle ih = (ModelHandle)modelAddr; ResultHandle ir = AllocAllResultHandle(ih); UNI_DEBUG_LOG("JNI %s end.\n", __FUNCTION__); - return (long)ir; + return (long long)ir; } extern "C" JNIEXPORT jlong JNICALL BOLT_JNI_PREFIX(BoltModel_allocSpecificResultHandle)( @@ -387,12 +389,12 @@ extern "C" JNIEXPORT jlong JNICALL BOLT_JNI_PREFIX(BoltModel_allocSpecificResult env->GetArrayLength(outputNames), num_outputs); } ModelHandle ih = (ModelHandle)modelAddr; - char **output_names_ptr = (char **)malloc(sizeof(char *) * num_outputs); + char **output_names_ptr = (char **)UNI_MALLOC(sizeof(char *) * num_outputs); for (int i = 0; i < num_outputs; i++) { jstring cur_str = (jstring)(env->GetObjectArrayElement(outputNames, i)); const char *cur_str_ptr = env->GetStringUTFChars(cur_str, 0); int length = strlen(cur_str_ptr); - output_names_ptr[i] = (char *)malloc(sizeof(char) * (length + 1)); 
+ output_names_ptr[i] = (char *)UNI_MALLOC(sizeof(char) * (length + 1)); UNI_MEMCPY(output_names_ptr[i], cur_str_ptr, length); output_names_ptr[i][length] = '\0'; @@ -402,11 +404,11 @@ extern "C" JNIEXPORT jlong JNICALL BOLT_JNI_PREFIX(BoltModel_allocSpecificResult ResultHandle ir = AllocSpecificResultHandle(ih, num_outputs, (const char **)output_names_ptr); for (int i = 0; i < num_outputs; i++) { - free(output_names_ptr[i]); + UNI_FREE(output_names_ptr[i]); } - free(output_names_ptr); + UNI_FREE(output_names_ptr); UNI_DEBUG_LOG("JNI %s end.\n", __FUNCTION__); - return (long)ir; + return (long long)ir; } extern "C" JNIEXPORT void JNICALL BOLT_JNI_PREFIX(BoltModel_setRuntimeDeviceJNI)( @@ -462,13 +464,13 @@ extern "C" JNIEXPORT void JNICALL BOLT_JNI_PREFIX(BoltModel_runModel)(JNIEnv *en CNN *cnn = (CNN *)ihInfo->cnn; std::map> inMap = cnn->get_input(); - char **input_names_ptr = (char **)malloc(sizeof(char *) * num_input); - void **mem_ptr = (void **)malloc(sizeof(void *) * num_input); + char **input_names_ptr = (char **)UNI_MALLOC(sizeof(char *) * num_input); + void **mem_ptr = (void **)UNI_MALLOC(sizeof(void *) * num_input); for (int i = 0; i < num_input; i++) { jstring cur_str = (jstring)(env->GetObjectArrayElement(input_names, i)); const char *cur_str_ptr = env->GetStringUTFChars(cur_str, 0); int length = strlen(cur_str_ptr); - input_names_ptr[i] = (char *)malloc(sizeof(char) * (length + 1)); + input_names_ptr[i] = (char *)UNI_MALLOC(sizeof(char) * (length + 1)); UNI_MEMCPY(input_names_ptr[i], cur_str_ptr, length); input_names_ptr[i][length] = '\0'; env->ReleaseStringUTFChars(cur_str, cur_str_ptr); @@ -488,10 +490,10 @@ extern "C" JNIEXPORT void JNICALL BOLT_JNI_PREFIX(BoltModel_runModel)(JNIEnv *en RunModel(ih, ir, num_input, (const char **)input_names_ptr, mem_ptr); for (int i = 0; i < num_input; i++) { - free(input_names_ptr[i]); + UNI_FREE(input_names_ptr[i]); } - free(input_names_ptr); - free(mem_ptr); + UNI_FREE(input_names_ptr); + UNI_FREE(mem_ptr); UNI_DEBUG_LOG("JNI %s end.\n", __FUNCTION__); } @@ -583,7 +585,7 @@ extern "C" JNIEXPORT jlong JNICALL BOLT_JNI_PREFIX(BoltModel_cloneResultHandle)( UNI_DEBUG_LOG("JNI %s...\n", __FUNCTION__); ResultHandle ir = (ResultHandle)ResultHandleAddr; UNI_DEBUG_LOG("JNI %s end.\n", __FUNCTION__); - return (long)CloneResultHandle(ir); + return (long long)CloneResultHandle(ir); } extern "C" JNIEXPORT void JNICALL BOLT_JNI_PREFIX(BoltModel_freeResultHandle)( diff --git a/inference/engine/src/CMakeLists.txt b/inference/engine/src/CMakeLists.txt index aae57281..7f48bdc6 100644 --- a/inference/engine/src/CMakeLists.txt +++ b/inference/engine/src/CMakeLists.txt @@ -7,6 +7,9 @@ add_library(${PROJECT_NAME} SHARED ${srcs}) add_library(${PROJECT_NAME}_static STATIC ${srcs}) target_link_libraries(${PROJECT_NAME} LINK_PUBLIC tensor image model_spec) +if (USE_SECURE_C) + target_link_libraries(${PROJECT_NAME} LINK_PUBLIC ${SecureC_SHARED_LIBRARY}) +endif () if (BUILD_TEST) target_link_libraries(${PROJECT_NAME} LINK_PUBLIC ${JPEG_SHARED_LIBRARY}) endif (BUILD_TEST) diff --git a/inference/engine/src/bolt.cpp b/inference/engine/src/bolt.cpp index f1f0700e..12218319 100644 --- a/inference/engine/src/bolt.cpp +++ b/inference/engine/src/bolt.cpp @@ -11,8 +11,8 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-#include "inference.hpp" #include "../api/c/bolt.h" +#include "inference.hpp" #define NAME_VALUE_PAIR(x) #x, x const int DataDescMaxDims = 8; @@ -39,14 +39,14 @@ typedef struct { DEVICE_TYPE deviceType; } ResultHandleInner; -inline DataType DATA_TYPE2DataType(DATA_TYPE dt_user) +inline static DataType DATA_TYPE2DataType(DATA_TYPE dt_user) { DataType ret = DT_F32; switch (dt_user) { case FP_32: ret = DT_F32; break; -#ifdef __aarch64__ +#ifdef _USE_FP16 case FP_16: ret = DT_F16; break; @@ -64,14 +64,14 @@ inline DataType DATA_TYPE2DataType(DATA_TYPE dt_user) return ret; } -inline DATA_TYPE DataType2DATA_TYPE(DataType dt_bolt) +inline static DATA_TYPE DataType2DATA_TYPE(DataType dt_bolt) { DATA_TYPE ret = FP_32; switch (dt_bolt) { case DT_F32: ret = FP_32; break; -#ifdef __aarch64__ +#ifdef _USE_FP16 case DT_F16: ret = FP_16; break; @@ -89,7 +89,7 @@ inline DATA_TYPE DataType2DATA_TYPE(DataType dt_bolt) return ret; } -inline DataFormat DATA_FORMAT2DataFormat(DATA_FORMAT df_user) +inline static DataFormat DATA_FORMAT2DataFormat(DATA_FORMAT df_user) { DataFormat ret = DF_NCHW; switch (df_user) { @@ -116,7 +116,7 @@ inline DataFormat DATA_FORMAT2DataFormat(DATA_FORMAT df_user) return ret; } -inline DATA_FORMAT DataFormat2DATA_FORMAT(DataFormat df_bolt) +inline static DATA_FORMAT DataFormat2DATA_FORMAT(DataFormat df_bolt) { DATA_FORMAT ret = NCHW; switch (df_bolt) { @@ -146,10 +146,13 @@ inline DATA_FORMAT DataFormat2DATA_FORMAT(DataFormat df_bolt) return ret; } -inline AffinityPolicy AFFINITY_TYPE2AffinityPolicy(AFFINITY_TYPE affinity) +inline static AffinityPolicy AFFINITY_TYPE2AffinityPolicy(AFFINITY_TYPE affinity) { AffinityPolicy ret = AFFINITY_CPU_HIGH_PERFORMANCE; switch (affinity) { + case CPU: + ret = AFFINITY_CPU; + break; case CPU_HIGH_PERFORMANCE: ret = AFFINITY_CPU_HIGH_PERFORMANCE; break; @@ -167,7 +170,7 @@ inline AffinityPolicy AFFINITY_TYPE2AffinityPolicy(AFFINITY_TYPE affinity) return ret; } -inline Arch DEVICE_TYPE2Arch(DEVICE_TYPE device) +inline static Arch DEVICE_TYPE2Arch(DEVICE_TYPE device) { Arch ret = ARM_V8; switch (device) { @@ -192,6 +195,9 @@ inline Arch DEVICE_TYPE2Arch(DEVICE_TYPE device) case CPU_X86_AVX2: ret = X86_AVX2; break; + case CPU_X86_AVX512: + ret = X86_AVX512; + break; case CPU_SERIAL: ret = CPU_GENERAL; break; @@ -203,7 +209,7 @@ inline Arch DEVICE_TYPE2Arch(DEVICE_TYPE device) return ret; } -inline DEVICE_TYPE Arch2DEVICE_TYPE(Arch arch) +inline static DEVICE_TYPE Arch2DEVICE_TYPE(Arch arch) { DEVICE_TYPE ret = CPU_ARM_V8; switch (arch) { @@ -242,7 +248,7 @@ inline DEVICE_TYPE Arch2DEVICE_TYPE(Arch arch) return ret; } -void TensorDesc2DataDesc(TensorDesc srcDesc, DataDesc *dstDesc) +inline static void TensorDesc2DataDesc(TensorDesc srcDesc, DataDesc *dstDesc) { dstDesc->dt = srcDesc.dt; dstDesc->df = srcDesc.df; @@ -258,102 +264,155 @@ void TensorDesc2DataDesc(TensorDesc srcDesc, DataDesc *dstDesc) } } -void assert_not_nullptr(const char *funcName, const char *ptrName, const void *ptr) +inline static void assert_not_nullptr(const char *funcName, const char *ptrName, const void *ptr) { if (ptr == NULL) { - UNI_ERROR_LOG("C API %s received null ptr %s.\n", funcName, ptrName); + UNI_WARNING_LOG("C API %s received null ptr %s.\n", funcName, ptrName); } } +static void print_model_handle(ModelHandleInner *handle) +{ + if (handle == nullptr) { + UNI_DEBUG_LOG("ModelHandle %p\n", handle); + } else { + UNI_DEBUG_LOG("ModelHandle %p(modelspec:%p engine:%p device:%d algorithm:%s file " + "stream:%d)\n", + handle, handle->ms, handle->cnn, handle->deviceType, 
(const char *)handle->algoPath, + handle->useFileStream); + } +} + +static void print_result_handle(ResultHandleInner *handle) +{ + if (handle == nullptr) { + UNI_DEBUG_LOG("ResultHandle %p\n", handle); + } else { + UNI_DEBUG_LOG("ResultHandle %p(num:%u data:%p device:%d)\n", handle, handle->num_outputs, + handle->outputArr, handle->deviceType); + } +} + +const char *const *GetDataTypeString() +{ + static const char *const names[] = {"FP_32", "FP_16", "INT_32", "UINT_32"}; + return names; +} + +const char *const *GetDataFormatString() +{ + static const char *const names[] = {"NCHW", "NHWC", "NCHWC8", "MTK", "NORMAL"}; + return names; +} + void GetGpuDeviceName(char *gpuDeviceName) { - UNI_DEBUG_LOG("C API %s...\n", __FUNCTION__); + UNI_DEBUG_LOG("C API %s(%p)...\n", __FUNCTION__, gpuDeviceName); std::string deviceName = "unKnown"; #ifdef _USE_GPU deviceName = OCLContext::getInstance().handle->deviceName; #endif - strcpy(gpuDeviceName, deviceName.c_str()); - UNI_DEBUG_LOG("C API %s end.\n", __FUNCTION__); + UNI_STRCPY(gpuDeviceName, deviceName.c_str()); + UNI_DEBUG_LOG("C API %s(%s) end.\n", __FUNCTION__, gpuDeviceName); } ModelHandle CreateModel(const char *modelPath, AFFINITY_TYPE affinity, const char *algorithmMapPath) { - UNI_DEBUG_LOG("C API %s...\n", __FUNCTION__); + UNI_DEBUG_LOG("C API %s(%p, %d, %p)...\n", __FUNCTION__, modelPath, affinity, algorithmMapPath); assert_not_nullptr(__FUNCTION__, NAME_VALUE_PAIR(modelPath)); - ModelHandleInner *handle = new ModelHandleInner(); - ModelSpec *ms = new ModelSpec(); - if (SUCCESS != deserialize_model_from_file(modelPath, ms)) { - UNI_ERROR_LOG("C API %s failed to load model %s.\n", __FUNCTION__, modelPath); - delete ms; - handle->cnn = nullptr; - return (ModelHandle)handle; - } - CNN *cnn = new CNN(AFFINITY_TYPE2AffinityPolicy(affinity), ms->dt, ms->model_name); - cnn->sort_operators_sequential(ms); - cnn->initialize_ops(ms); - - handle->cnn = (void *)cnn; - handle->ms = (void *)ms; - handle->deviceType = Arch2DEVICE_TYPE(cnn->get_runtime_device()); - handle->algoPath = (void *)algorithmMapPath; - handle->useFileStream = false; - UNI_DEBUG_LOG("C API %s end.\n", __FUNCTION__); + ModelHandleInner *handle = nullptr; + if (modelPath != nullptr) { + ModelSpec *ms = new ModelSpec(); + if (SUCCESS != deserialize_model_from_file(modelPath, ms)) { + UNI_WARNING_LOG("C API %s failed to load model %s.\n", __FUNCTION__, modelPath); + delete ms; + } else { + CNN *cnn = new CNN(AFFINITY_TYPE2AffinityPolicy(affinity), ms->dt, ms->model_name); + cnn->sort_operators_sequential(ms); + cnn->initialize_ops(ms); + + handle = new ModelHandleInner(); + handle->cnn = (void *)cnn; + handle->ms = (void *)ms; + handle->deviceType = Arch2DEVICE_TYPE(cnn->get_runtime_device()); + handle->algoPath = (void *)algorithmMapPath; + handle->useFileStream = false; + } + } + UNI_DEBUG_LOG("C API %s(%p) end.\n", __FUNCTION__, handle); + print_model_handle(handle); return (ModelHandle)handle; } ModelHandle CloneModel(ModelHandle ih) { - UNI_DEBUG_LOG("C API %s...\n", __FUNCTION__); + UNI_DEBUG_LOG("C API %s(%p)...\n", __FUNCTION__, ih); ModelHandleInner *handle = (ModelHandleInner *)ih; + print_model_handle(handle); assert_not_nullptr(__FUNCTION__, "ModelHandle", handle); - CNN *cnn = (CNN *)handle->cnn; - assert_not_nullptr(__FUNCTION__, "ModelHandle.cnn", cnn); - ModelHandleInner *cloneHandle = new ModelHandleInner(); - *cloneHandle = *handle; - CNN *cloneCnn = new CNN(); - *cloneCnn = cnn->clone(); - cloneHandle->cnn = cloneCnn; - UNI_DEBUG_LOG("C API %s end.\n", 
__FUNCTION__); + ModelHandleInner *cloneHandle = nullptr; + if (handle != nullptr) { + CNN *cnn = (CNN *)handle->cnn; + assert_not_nullptr(__FUNCTION__, "ModelHandle.cnn", cnn); + if (cnn != nullptr) { + cloneHandle = new ModelHandleInner(); + *cloneHandle = *handle; + CNN *cloneCnn = new CNN(); + *cloneCnn = cnn->clone(); + cloneHandle->cnn = cloneCnn; + } + } + UNI_DEBUG_LOG("C API %s(%p) end.\n", __FUNCTION__, cloneHandle); + print_model_handle(cloneHandle); return (ModelHandle)cloneHandle; } ModelHandle CreateModelWithFileStream( const char *modelFileStream, AFFINITY_TYPE affinity, const char *algorithmMapFileStream) { - UNI_DEBUG_LOG("C API %s...\n", __FUNCTION__); + UNI_DEBUG_LOG("C API %s(%p, %d, %p)...\n", __FUNCTION__, modelFileStream, affinity, + algorithmMapFileStream); assert_not_nullptr(__FUNCTION__, NAME_VALUE_PAIR(modelFileStream)); - ModelHandleInner *handle = new ModelHandleInner(); - ModelSpec *ms = new ModelSpec(); - if (SUCCESS != deserialize_model_from_file(modelFileStream, ms, true)) { - UNI_ERROR_LOG("C API %s failed to parse model.\n", __FUNCTION__); - delete ms; - handle->cnn = nullptr; - return (ModelHandle)handle; - } - CNN *cnn = new CNN(AFFINITY_TYPE2AffinityPolicy(affinity), ms->dt, ms->model_name); - cnn->sort_operators_sequential(ms); - cnn->initialize_ops(ms); - - handle->cnn = (void *)cnn; - handle->ms = (void *)ms; - handle->deviceType = Arch2DEVICE_TYPE(cnn->get_runtime_device()); - handle->algoPath = (void *)algorithmMapFileStream; - handle->useFileStream = true; - UNI_DEBUG_LOG("C API %s end.\n", __FUNCTION__); + ModelHandleInner *handle = nullptr; + if (modelFileStream != nullptr) { + ModelSpec *ms = new ModelSpec(); + if (SUCCESS != deserialize_model_from_file(modelFileStream, ms, true)) { + UNI_WARNING_LOG("C API %s failed to parse model.\n", __FUNCTION__); + delete ms; + } else { + CNN *cnn = new CNN(AFFINITY_TYPE2AffinityPolicy(affinity), ms->dt, ms->model_name); + cnn->sort_operators_sequential(ms); + cnn->initialize_ops(ms); + + handle = new ModelHandleInner(); + handle->cnn = (void *)cnn; + handle->ms = (void *)ms; + handle->deviceType = Arch2DEVICE_TYPE(cnn->get_runtime_device()); + handle->algoPath = (void *)algorithmMapFileStream; + handle->useFileStream = true; + } + } + UNI_DEBUG_LOG("C API %s(%p) end.\n", __FUNCTION__, handle); + print_model_handle(handle); return (ModelHandle)handle; } int GetNumInputsFromModel(ModelHandle ih) { - UNI_DEBUG_LOG("C API %s...\n", __FUNCTION__); + UNI_DEBUG_LOG("C API %s(%p)...\n", __FUNCTION__, ih); ModelHandleInner *ihInfo = (ModelHandleInner *)ih; assert_not_nullptr(__FUNCTION__, "ModelHandle", ihInfo); - - CNN *cnn = (CNN *)ihInfo->cnn; - assert_not_nullptr(__FUNCTION__, "ModelHandle.cnn", cnn); - UNI_DEBUG_LOG("C API %s end.\n", __FUNCTION__); - - return (cnn->get_input_desc()).size(); + print_model_handle(ihInfo); + int ret = 0; + if (ihInfo != nullptr) { + CNN *cnn = (CNN *)ihInfo->cnn; + assert_not_nullptr(__FUNCTION__, "ModelHandle.cnn", cnn); + if (cnn != nullptr) { + ret = (cnn->get_input_desc()).size(); + } + } + UNI_DEBUG_LOG("C API %s(%d) end.\n", __FUNCTION__, ret); + return ret; } void GetInputDataInfoFromModel(ModelHandle ih, @@ -382,9 +441,11 @@ void GetInputDataInfoFromModel5D(ModelHandle handle, DATA_TYPE *dt, DATA_FORMAT *df) { - UNI_DEBUG_LOG("C API %s...\n", __FUNCTION__); + UNI_DEBUG_LOG("C API %s(%p, %d, %p, %p, %p, %p, %p, %p, %p, %p)...\n", __FUNCTION__, handle, + num_inputs, name, n, c, t, h, w, dt, df); ModelHandleInner *ihInfo = (ModelHandleInner *)handle; 
assert_not_nullptr(__FUNCTION__, "ModelHandle", ihInfo); + print_model_handle(ihInfo); CNN *cnn = (CNN *)ihInfo->cnn; assert_not_nullptr(__FUNCTION__, "ModelHandle.cnn", cnn); @@ -411,7 +472,7 @@ void GetInputDataInfoFromModel5D(ModelHandle handle, U32 in, ic, it, ih, iw; int i = 0; for (auto iter : inputTensorDescs) { - strcpy(name[i], iter.first.c_str()); + UNI_STRCPY(name[i], iter.first.c_str()); TensorDesc desc = iter.second; in = ic = it = ih = iw = 1; if (tensorIs1d(desc)) { @@ -440,7 +501,7 @@ void GetInputDataInfoFromModel5D(ModelHandle handle, UNI_DEBUG_LOG("C API %s end.\n", __FUNCTION__); } -std::map getInputDataFormatFromUser(ModelHandle ih, +static std::map getInputDataFormatFromUser(ModelHandle ih, int num_inputs, const char **name, const int *n, @@ -481,7 +542,7 @@ std::map getInputDataFormatFromUser(ModelHandle ih, if (inputTensorDescs.find(inputName) == inputTensorDescs.end()) { UNI_ERROR_LOG( "C API inner function %s received %s is not model input.\n", __FUNCTION__, name[i]); - exit(1); + continue; } DataType idt = DATA_TYPE2DataType(dt[i]); DataFormat idf = DATA_FORMAT2DataFormat(df[i]); @@ -525,7 +586,7 @@ void PrepareModel(ModelHandle ih, } void PrepareModel5D(ModelHandle ih, - int num_input, + int num_inputs, const char **name, const int *n, const int *c, @@ -535,22 +596,25 @@ void PrepareModel5D(ModelHandle ih, const DATA_TYPE *dt, const DATA_FORMAT *df) { - UNI_DEBUG_LOG("C API %s...\n", __FUNCTION__); + UNI_DEBUG_LOG("C API %s(%p, %d, %p, %p, %p, %p, %p, %p, %p, %p)...\n", __FUNCTION__, ih, + num_inputs, name, n, c, t, h, w, dt, df); ModelHandleInner *ihInfo = (ModelHandleInner *)ih; assert_not_nullptr(__FUNCTION__, "ModelHandle", ihInfo); + print_model_handle(ihInfo); CNN *cnn = (CNN *)ihInfo->cnn; assert_not_nullptr(__FUNCTION__, "ModelHandle.cnn", cnn); std::map modelInputDims = - getInputDataFormatFromUser(ih, num_input, name, n, c, t, h, w, dt, df); - cnn->loadAlgorithmMap((const char *)ihInfo->algoPath, ihInfo->useFileStream); + getInputDataFormatFromUser(ih, num_inputs, name, n, c, t, h, w, dt, df); + //cnn->loadAlgorithmMap((const char *)ihInfo->algoPath, ihInfo->useFileStream); cnn->ready(modelInputDims); cnn->mark_input_output(); ModelSpec *ms = (ModelSpec *)ihInfo->ms; CHECK_STATUS(mt_destroy_model(ms)); delete ms; + ihInfo->ms = nullptr; UNI_DEBUG_LOG("C API %s end.\n", __FUNCTION__); } @@ -564,9 +628,12 @@ void ResizeModelInput(ModelHandle ih, const DATA_TYPE *dt, const DATA_FORMAT *df) { - UNI_DEBUG_LOG("C API %s...\n", __FUNCTION__); + UNI_DEBUG_LOG("C API %s(%p, %d, %p, %p, %p, %p, %p, %p, %p)...\n", __FUNCTION__, ih, num_inputs, + name, n, c, h, w, dt, df); ModelHandleInner *ihInfo = (ModelHandleInner *)ih; assert_not_nullptr(__FUNCTION__, "ModelHandle", ihInfo); + print_model_handle(ihInfo); + CNN *cnn = (CNN *)ihInfo->cnn; assert_not_nullptr(__FUNCTION__, "ModelHandle.cnn", cnn); @@ -578,67 +645,80 @@ void ResizeModelInput(ModelHandle ih, ResultHandle AllocAllResultHandle(ModelHandle ih) { - UNI_DEBUG_LOG("C API %s...\n", __FUNCTION__); + UNI_DEBUG_LOG("C API %s(%p)...\n", __FUNCTION__, ih); ModelHandleInner *ihInfo = (ModelHandleInner *)ih; assert_not_nullptr(__FUNCTION__, "ModelHandle", ihInfo); - CNN *cnn = (CNN *)ihInfo->cnn; - assert_not_nullptr(__FUNCTION__, "ModelHandle.cnn", cnn); - - ResultHandleInner *model_result_ptr = (ResultHandleInner *)malloc(sizeof(ResultHandleInner)); - std::map outputTensorDescs = cnn->get_output_desc(); - int num_outputs = outputTensorDescs.size(); - DataDesc *outputArrPtr = (DataDesc *)malloc(sizeof(DataDesc) 
* num_outputs); - int i = 0; - for (auto iter : outputTensorDescs) { - std::string name = iter.first; - U32 length = name.size(); - length = (length > NAME_LEN) ? NAME_LEN : length; - memcpy(outputArrPtr[i].name, name.c_str(), length); - if (length < NAME_LEN) { - outputArrPtr[i].name[length] = '\0'; + print_model_handle(ihInfo); + ResultHandleInner *model_result_ptr = nullptr; + if (ihInfo != nullptr) { + CNN *cnn = (CNN *)ihInfo->cnn; + assert_not_nullptr(__FUNCTION__, "ModelHandle.cnn", cnn); + if (cnn != nullptr) { + model_result_ptr = (ResultHandleInner *)UNI_MALLOC(sizeof(ResultHandleInner)); + std::map outputTensorDescs = cnn->get_output_desc(); + int num_outputs = outputTensorDescs.size(); + DataDesc *outputArrPtr = (DataDesc *)UNI_MALLOC(sizeof(DataDesc) * num_outputs); + int i = 0; + for (auto iter : outputTensorDescs) { + std::string name = iter.first; + U32 length = name.size(); + length = (length > NAME_LEN) ? NAME_LEN : length; + UNI_MEMCPY(outputArrPtr[i].name, name.c_str(), length); + if (length < NAME_LEN) { + outputArrPtr[i].name[length] = '\0'; + } + TensorDesc2DataDesc(iter.second, &outputArrPtr[i]); + i++; + } + model_result_ptr->num_outputs = num_outputs; + model_result_ptr->outputArr = outputArrPtr; + model_result_ptr->deviceType = ihInfo->deviceType; } - TensorDesc2DataDesc(iter.second, &outputArrPtr[i]); - i++; } - model_result_ptr->num_outputs = num_outputs; - model_result_ptr->outputArr = outputArrPtr; - model_result_ptr->deviceType = ihInfo->deviceType; - UNI_DEBUG_LOG("C API %s end.\n", __FUNCTION__); - return (void *)model_result_ptr; + UNI_DEBUG_LOG("C API %s(%p) end.\n", __FUNCTION__, model_result_ptr); + print_result_handle(model_result_ptr); + return (ResultHandle)model_result_ptr; } ResultHandle AllocSpecificResultHandle(ModelHandle ih, int num_outputs, const char **name) { - UNI_DEBUG_LOG("C API %s...\n", __FUNCTION__); + UNI_DEBUG_LOG("C API %s(%p, %d, %p)...\n", __FUNCTION__, ih, num_outputs, name); ModelHandleInner *ihInfo = (ModelHandleInner *)ih; assert_not_nullptr(__FUNCTION__, "ModelHandle", ihInfo); - CNN *cnn = (CNN *)ihInfo->cnn; - assert_not_nullptr(__FUNCTION__, "ModelHandle.cnn", cnn); - - ResultHandleInner *model_result_ptr = (ResultHandleInner *)malloc(sizeof(ResultHandleInner)); - int model_num_outputs = num_outputs; - DataDesc *outputArrPtr = (DataDesc *)malloc(sizeof(DataDesc) * model_num_outputs); - for (int i = 0; i < num_outputs; i++) { - U32 length = UNI_MIN(strlen(name[i]), NAME_LEN - 1); - memcpy(outputArrPtr[i].name, name[i], length); - if (length < NAME_LEN) { - outputArrPtr[i].name[length] = '\0'; + print_model_handle(ihInfo); + ResultHandleInner *model_result_ptr = nullptr; + if (ihInfo != nullptr) { + CNN *cnn = (CNN *)ihInfo->cnn; + assert_not_nullptr(__FUNCTION__, "ModelHandle.cnn", cnn); + if (cnn != nullptr) { + model_result_ptr = (ResultHandleInner *)UNI_MALLOC(sizeof(ResultHandleInner)); + int model_num_outputs = num_outputs; + DataDesc *outputArrPtr = (DataDesc *)UNI_MALLOC(sizeof(DataDesc) * model_num_outputs); + for (int i = 0; i < num_outputs; i++) { + U32 length = UNI_MIN(strlen(name[i]), NAME_LEN - 1); + UNI_MEMCPY(outputArrPtr[i].name, name[i], length); + if (length < NAME_LEN) { + outputArrPtr[i].name[length] = '\0'; + } + TensorDesc srcDesc = cnn->get_tensor_desc_by_name(name[i]); + TensorDesc2DataDesc(srcDesc, &outputArrPtr[i]); + } + model_result_ptr->num_outputs = model_num_outputs; + model_result_ptr->outputArr = outputArrPtr; + model_result_ptr->deviceType = ihInfo->deviceType; } - TensorDesc srcDesc = 
cnn->get_tensor_desc_by_name(name[i]); - TensorDesc2DataDesc(srcDesc, &outputArrPtr[i]); } - model_result_ptr->num_outputs = model_num_outputs; - model_result_ptr->outputArr = outputArrPtr; - model_result_ptr->deviceType = ihInfo->deviceType; - UNI_DEBUG_LOG("C API %s end.\n", __FUNCTION__); + UNI_DEBUG_LOG("C API %s(%p) end.\n", __FUNCTION__, model_result_ptr); + print_result_handle(model_result_ptr); return (void *)model_result_ptr; } void SetRuntimeDevice(ModelHandle ih, int cpu_id, DEVICE_TYPE device) { - UNI_DEBUG_LOG("C API %s...\n", __FUNCTION__); + UNI_DEBUG_LOG("C API %s(%p, %d, %d)...\n", __FUNCTION__, ih, cpu_id, device); ModelHandleInner *ihInfo = (ModelHandleInner *)ih; assert_not_nullptr(__FUNCTION__, "ModelHandle", ihInfo); + print_model_handle(ihInfo); CNN *cnn = (CNN *)ihInfo->cnn; assert_not_nullptr(__FUNCTION__, "ModelHandle.cnn", cnn); cnn->set_runtime_device(cpu_id, DEVICE_TYPE2Arch(device)); @@ -648,9 +728,10 @@ void SetRuntimeDevice(ModelHandle ih, int cpu_id, DEVICE_TYPE device) void SetRuntimeDeviceDynamic(ModelHandle ih) { - UNI_DEBUG_LOG("C API %s...\n", __FUNCTION__); + UNI_DEBUG_LOG("C API %s(%p)...\n", __FUNCTION__, ih); ModelHandleInner *ihInfo = (ModelHandleInner *)ih; assert_not_nullptr(__FUNCTION__, "ModelHandle", ihInfo); + print_model_handle(ihInfo); CNN *cnn = (CNN *)ihInfo->cnn; assert_not_nullptr(__FUNCTION__, "ModelHandle.cnn", cnn); cnn->set_runtime_device_dynamic(); @@ -660,20 +741,22 @@ void SetRuntimeDeviceDynamic(ModelHandle ih) void SetNumThreads(int threadNum) { - UNI_DEBUG_LOG("C API %s...\n", __FUNCTION__); + UNI_DEBUG_LOG("C API %s(%d)...\n", __FUNCTION__, threadNum); set_cpu_num_threads(threadNum); UNI_DEBUG_LOG("C API %s end.\n", __FUNCTION__); } void RunModel(ModelHandle ih, ResultHandle ir, int num_inputs, const char **name, void **data) { - UNI_DEBUG_LOG("C API %s...\n", __FUNCTION__); + UNI_DEBUG_LOG("C API %s(%p, %p, %d, %p, %p)...\n", __FUNCTION__, ih, ir, num_inputs, name, data); ModelHandleInner *ihInfo = (ModelHandleInner *)ih; assert_not_nullptr(__FUNCTION__, "ModelHandle", ihInfo); + print_model_handle(ihInfo); CNN *cnn = (CNN *)ihInfo->cnn; assert_not_nullptr(__FUNCTION__, "ModelHandle.cnn", cnn); ResultHandleInner *ir_inner = (ResultHandleInner *)ir; assert_not_nullptr(__FUNCTION__, "ResultHandle", ir_inner); + print_result_handle(ir_inner); if (num_inputs > 0) { assert_not_nullptr(__FUNCTION__, NAME_VALUE_PAIR(name)); assert_not_nullptr(__FUNCTION__, NAME_VALUE_PAIR(data)); @@ -720,11 +803,16 @@ void RunModel(ModelHandle ih, ResultHandle ir, int num_inputs, const char **name int GetNumOutputsFromResultHandle(ResultHandle ir) { - UNI_DEBUG_LOG("C API %s...\n", __FUNCTION__); + UNI_DEBUG_LOG("C API %s(%p)...\n", __FUNCTION__, ir); ResultHandleInner *ir_inner = (ResultHandleInner *)ir; assert_not_nullptr(__FUNCTION__, "ResultHandle", ir_inner); - UNI_DEBUG_LOG("C API %s end.\n", __FUNCTION__); - return ir_inner->num_outputs; + print_result_handle(ir_inner); + int ret = 0; + if (ir_inner != nullptr) { + ret = ir_inner->num_outputs; + } + UNI_DEBUG_LOG("C API %s(%d) end.\n", __FUNCTION__, ret); + return ret; } void GetOutputDataInfoFromResultHandle(ResultHandle ir, @@ -737,13 +825,15 @@ void GetOutputDataInfoFromResultHandle(ResultHandle ir, DATA_TYPE *dt, DATA_FORMAT *df) { - UNI_DEBUG_LOG("C API %s...\n", __FUNCTION__); + UNI_DEBUG_LOG("C API %s(%p, %d, %p, %p, %p, %p, %p, %p, %p)...\n", __FUNCTION__, ir, + num_outputs, name, n, c, h, w, dt, df); if (num_outputs <= 0) { UNI_WARNING_LOG("C API %s received num_outputs = 0.\n", 
__FUNCTION__); return; } ResultHandleInner *ir_inner = (ResultHandleInner *)ir; assert_not_nullptr(__FUNCTION__, "ResultHandle", ir_inner); + print_result_handle(ir_inner); if (num_outputs != (int)ir_inner->num_outputs) { UNI_ERROR_LOG("C API %s received num_outputs %d != num_outputs %d in ResultHandle.\n", __FUNCTION__, num_outputs, ir_inner->num_outputs); @@ -759,7 +849,7 @@ void GetOutputDataInfoFromResultHandle(ResultHandle ir, assert_not_nullptr(__FUNCTION__, NAME_VALUE_PAIR(dt)); assert_not_nullptr(__FUNCTION__, NAME_VALUE_PAIR(df)); for (int i = 0; i < num_outputs; i++) { - strcpy(name[i], outputArrPtr[i].name); + UNI_STRCPY(name[i], outputArrPtr[i].name); dt[i] = DataType2DATA_TYPE(outputArrPtr[i].dt); df[i] = DataFormat2DATA_FORMAT(outputArrPtr[i].df); n[i] = outputArrPtr[i].dims[0]; @@ -772,13 +862,14 @@ void GetOutputDataInfoFromResultHandle(ResultHandle ir, void GetOutputDataFromResultHandle(ResultHandle ir, int num_outputs, void **data) { - UNI_DEBUG_LOG("C API %s...\n", __FUNCTION__); + UNI_DEBUG_LOG("C API %s(%p, %d, %p)...\n", __FUNCTION__, ir, num_outputs, data); if (num_outputs <= 0) { UNI_WARNING_LOG("C API %s received num_outputs = 0.\n", __FUNCTION__); return; } ResultHandleInner *ir_inner = (ResultHandleInner *)ir; assert_not_nullptr(__FUNCTION__, "ResultHandle", ir_inner); + print_result_handle(ir_inner); if (num_outputs != (int)ir_inner->num_outputs) { UNI_ERROR_LOG("C API %s received num_outputs %d != num_outputs %d in ResultHandle.\n", __FUNCTION__, num_outputs, ir_inner->num_outputs); @@ -795,58 +886,80 @@ void GetOutputDataFromResultHandle(ResultHandle ir, int num_outputs, void **data ResultHandle CloneResultHandle(ResultHandle ir) { - UNI_DEBUG_LOG("C API %s...\n", __FUNCTION__); + UNI_DEBUG_LOG("C API %s(%p)...\n", __FUNCTION__, ir); ResultHandleInner *ir_inner = (ResultHandleInner *)ir; assert_not_nullptr(__FUNCTION__, "ResultHandle", ir_inner); - ResultHandleInner *clone_ir_inner = (ResultHandleInner *)malloc(sizeof(ResultHandleInner)); - *clone_ir_inner = *ir_inner; - U32 size = sizeof(DataDesc) * clone_ir_inner->num_outputs; - if (size > 0) { - clone_ir_inner->outputArr = (DataDesc *)malloc(size); - DataDesc *outputArrPtr = ir_inner->outputArr; - assert_not_nullptr(__FUNCTION__, "ResultHandle.outputArr", outputArrPtr); - memcpy(clone_ir_inner->outputArr, outputArrPtr, size); - } else { - clone_ir_inner->outputArr = nullptr; + print_result_handle(ir_inner); + ResultHandleInner *clone_ir_inner = nullptr; + if (ir_inner != nullptr) { + clone_ir_inner = (ResultHandleInner *)UNI_MALLOC(sizeof(ResultHandleInner)); + *clone_ir_inner = *ir_inner; + U32 size = sizeof(DataDesc) * clone_ir_inner->num_outputs; + if (size > 0) { + clone_ir_inner->outputArr = (DataDesc *)UNI_MALLOC(size); + DataDesc *outputArrPtr = ir_inner->outputArr; + assert_not_nullptr(__FUNCTION__, "ResultHandle.outputArr", outputArrPtr); + UNI_MEMCPY(clone_ir_inner->outputArr, outputArrPtr, size); + } else { + clone_ir_inner->outputArr = nullptr; + } } - UNI_DEBUG_LOG("C API %s end.\n", __FUNCTION__); + UNI_DEBUG_LOG("C API %s(%p) end.\n", __FUNCTION__, clone_ir_inner); + print_result_handle(clone_ir_inner); return (ResultHandle)clone_ir_inner; } void FreeResultHandle(ResultHandle ir) { - UNI_DEBUG_LOG("C API %s...\n", __FUNCTION__); + UNI_DEBUG_LOG("C API %s(%p)...\n", __FUNCTION__, ir); ResultHandleInner *ir_inner = (ResultHandleInner *)ir; assert_not_nullptr(__FUNCTION__, "ResultHandle", ir_inner); - DataDesc *outputArrPtr = ir_inner->outputArr; - if (ir_inner->num_outputs > 0) { - 
assert_not_nullptr(__FUNCTION__, "ResultHandle.outputArr", outputArrPtr); - free(outputArrPtr); - ir_inner->num_outputs = 0; + print_result_handle(ir_inner); + if (ir_inner != nullptr) { + DataDesc *outputArrPtr = ir_inner->outputArr; + if (ir_inner->num_outputs > 0) { + assert_not_nullptr(__FUNCTION__, "ResultHandle.outputArr", outputArrPtr); + ir_inner->num_outputs = 0; + } + if (outputArrPtr != nullptr) { + UNI_FREE(outputArrPtr); + ir_inner->outputArr = nullptr; + } + UNI_FREE(ir_inner); } - (*ir_inner).outputArr = nullptr; - free(ir_inner); UNI_DEBUG_LOG("C API %s end.\n", __FUNCTION__); } void DestroyModel(ModelHandle ih) { - UNI_DEBUG_LOG("C API %s...\n", __FUNCTION__); + UNI_DEBUG_LOG("C API %s(%p)...\n", __FUNCTION__, ih); ModelHandleInner *ihInfo = (ModelHandleInner *)ih; assert_not_nullptr(__FUNCTION__, "ModelHandle", ihInfo); - - CNN *cnn = (CNN *)ihInfo->cnn; - assert_not_nullptr(__FUNCTION__, "ModelHandle.cnn", cnn); - - if (nullptr != ihInfo->algoPath && !ihInfo->useFileStream) { - const char *algoPath = (const char *)ihInfo->algoPath; - UNI_THREAD_SAFE(cnn->saveAlgorithmMapToFile(algoPath)); + print_model_handle(ihInfo); + if (ihInfo != nullptr) { + CNN *cnn = (CNN *)ihInfo->cnn; + assert_not_nullptr(__FUNCTION__, "ModelHandle.cnn", cnn); + if (cnn != nullptr) { + //if (ihInfo->algoPath != nullptr && !ihInfo->useFileStream) { + // const char *algoPath = (const char *)ihInfo->algoPath; + // UNI_THREAD_SAFE(cnn->saveAlgorithmMapToFile(algoPath)); + //} + delete cnn; + ihInfo->cnn = nullptr; + } + delete ihInfo; } - - delete cnn; - ihInfo->cnn = nullptr; - delete ihInfo; UNI_DEBUG_LOG("C API %s end.\n", __FUNCTION__); } +void MemoryCheck() +{ + UNI_DEBUG_LOG("C API %s()...\n", __FUNCTION__); +#ifndef _USE_MEM_CHECK + UNI_WARNING_LOG("please set USE_MEM_CHECK to ON at common/cmakes/bolt.cmake, and rebuild " + "library.\n"); +#endif + UNI_MEM_STATISTICS(); + UNI_DEBUG_LOG("C API %s end.\n", __FUNCTION__); +} #undef NAME_VALUE_PAIR diff --git a/inference/engine/src/bolt_dllite.cpp b/inference/engine/src/bolt_dllite.cpp index ae2eb36e..140637df 100644 --- a/inference/engine/src/bolt_dllite.cpp +++ b/inference/engine/src/bolt_dllite.cpp @@ -376,7 +376,7 @@ bolt::ResultHandle bolt::AllocResult( for (size_t i = 0; i < outputs.size(); i++) { U32 length = outputs[i].name.length(); outputNames[i] = (char *)malloc(length + 1); - memcpy(outputNames[i], outputs[i].name.c_str(), length); + UNI_MEMCPY(outputNames[i], outputs[i].name.c_str(), length); outputNames[i][length] = '\0'; } bolt::ResultHandle rh = (bolt::ResultHandle)AllocSpecificResultHandle( diff --git a/inference/engine/src/cnn.cpp b/inference/engine/src/cnn.cpp index 30d8e50b..56e8ec36 100644 --- a/inference/engine/src/cnn.cpp +++ b/inference/engine/src/cnn.cpp @@ -11,6 +11,7 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +#include #include "cnn.h" #ifdef _USE_CPU #include "cpu/factory_cpu.hpp" @@ -33,6 +34,22 @@ bool is_same_tensor(Tensor a, Tensor b) return ret; } +void CNN::check_dynamic_output_size(OperatorType type) +{ + std::set types = {OT_Shape, OT_NonMaxSuppression}; + if (types.find(type) != types.end()) { + this->dynamicOutputSize = true; + } + if (type == OT_Shape) { + UNI_WARNING_LOG("model contains Shape operator, this will use dynamic output size " + "inference(may encounter error). 
If you don't want to use it, you can use " + "onnx-simplifier to simplify original onnx model.\n"); + if (IS_GPU(this->deviceInfo.schedule)) { + UNI_ERROR_LOG("gpu currently not support dynamic output size inference.\n"); + } + } +} + CNN CNN::clone() { CNN cnn = *this; @@ -166,8 +183,7 @@ void CNN::initialize_ops(const ModelSpec *ms) std::shared_ptr factory; if (IS_GPU(this->deviceInfo.schedule)) { #ifdef _USE_GPU - auto factory_ocl = (Factory *)(new FactoryOCL()); - factory = std::shared_ptr(factory_ocl); + factory = std::shared_ptr(new FactoryOCL()); this->tmpTensor = Tensor(OCLMem); #else UNI_ERROR_LOG("This library not support ARM GPU, please rebuild library with --gpu " @@ -175,17 +191,18 @@ void CNN::initialize_ops(const ModelSpec *ms) exit(1); #endif } else { - auto factory_cpu = (Factory *)(new FactoryCPU()); - factory = std::shared_ptr(factory_cpu); + factory = std::shared_ptr(new FactoryCPU()); this->tmpTensor = Tensor(); } for (int i = 0; i < opNum; i++) { OperatorSpec curOps = ms->ops[i]; std::string opName = curOps.name; + UNI_DEBUG_LOG("create operator:%s type:%s.\n", curOps.name, OperatorTypeName()[curOps.type]); if (opName.compare("data") == 0) { continue; } + this->check_dynamic_output_size(curOps.type); std::vector inputTensorsName; std::vector outputTensorsName; for (U32 j = 0; j < curOps.num_inputs; j++) { @@ -212,6 +229,7 @@ void CNN::initialize_ops(const ModelSpec *ms) for (int i = 0; i < ms->num_weight_specs; i++) { WeightSpec curOpWs = ms->ws[i]; std::string opName = curOpWs.op_name; + UNI_DEBUG_LOG("set operator:%s's weight parameter.\n", curOpWs.op_name); if (this->operatorMap.find(opName) == this->operatorMap.end()) { UNI_WARNING_LOG("unsed weight %s in model.\n", opName.c_str()); continue; @@ -238,7 +256,7 @@ void CNN::ready(std::map inputDescMap) if (op->is_weight()) { UNI_DEBUG_LOG("op: %s init weight\n", op->get_name().c_str()); auto weightOpPtr = dynamic_cast(op.get()); - CHECK_STATUS(weightOpPtr->init_weight_bias_from_model(nullptr)); + CHECK_STATUS(weightOpPtr->init_weight_bias_from_model()); } UNI_DEBUG_LOG("op: %s infer forward algorithm\n", op->get_name().c_str()); //need process for qualcomm @@ -277,7 +295,6 @@ void CNN::reready(std::map inputDescMap) EE CNN::mark_input_output() { - EE ret = SUCCESS; for (auto &iter : this->inputTensors) { std::string str = iter.first; if (tensorMap.find(str) != tensorMap.end()) { @@ -285,8 +302,7 @@ EE CNN::mark_input_output() } else { UNI_ERROR_LOG( "can not find tensor(name: %s) to be marked as model input.\n", str.c_str()); - ret = NOT_MATCH; - break; + return NOT_MATCH; } } for (auto &iter : this->outputTensors) { @@ -297,11 +313,10 @@ EE CNN::mark_input_output() UNI_ERROR_LOG("can not find tensor(name: %s) to be marked as model output. 
Maybe this " "tensor is removed by graph optimizer.\n", str.c_str()); - ret = NOT_MATCH; - break; + return NOT_MATCH; } } - return ret; + return SUCCESS; } void CNN::set_input_by_copy(std::map modelTensorsInput) @@ -312,14 +327,16 @@ void CNN::set_input_by_copy(std::map modelTensorsInput) UNI_DEBUG_LOG(" Copy input %s...\n", inputName.c_str()); U8 *data = modelTensorInput.second; if (this->inputTensors.find(inputName) == this->inputTensors.end()) { - CHECK_STATUS(NOT_MATCH); + UNI_ERROR_LOG("Can not find input:%s to set.\n", inputName.c_str()); + return; } auto tensorPtr = this->inputTensors[inputName]; Tensor input; input.resize(tensorPtr->get_desc()); std::shared_ptr shared_data(data, [](U8 *ptr) {}); ((CpuMemory *)(input.get_memory()))->set_shared_ptr(shared_data); - tensorPtr->copy_from(&input); + UNI_PROFILE( + { tensorPtr->copy_from(&input); }, "copy " + inputName, std::string("input::copy")); UNI_DEBUG_LOG(" Copy input: %s %s\n", inputName.c_str(), tensorPtr->string(8).c_str()); } UNI_DEBUG_LOG("Copy input end.\n"); @@ -332,15 +349,20 @@ void CNN::set_input_by_assign(std::map> modelTe std::string inputName = modelTensorInput.first; std::shared_ptr data = modelTensorInput.second; if (this->inputTensors.find(inputName) == this->inputTensors.end()) { - CHECK_STATUS(NOT_MATCH); + UNI_ERROR_LOG("Can not find input:%s to set.\n", inputName.c_str()); + return; } auto tensorPtr = this->inputTensors[inputName]; - if (data != ((CpuMemory *)(tensorPtr->get_memory()))->get_shared_ptr()) { - Tensor input; - input.resize(tensorPtr->get_desc()); - ((CpuMemory *)(input.get_memory()))->set_shared_ptr(data); - tensorPtr->reuse(&input); - } + UNI_PROFILE( + { + if (data != ((CpuMemory *)(tensorPtr->get_memory()))->get_shared_ptr()) { + Tensor input; + input.resize(tensorPtr->get_desc()); + ((CpuMemory *)(input.get_memory()))->set_shared_ptr(data); + tensorPtr->reuse(&input); + } + }, + "copy " + inputName, std::string("input::copy")); UNI_DEBUG_LOG(" Set input: %s %s\n", inputName.c_str(), tensorPtr->string(8).c_str()); } UNI_DEBUG_LOG("Set input end.\n"); @@ -371,10 +393,13 @@ std::map> CNN::get_output() Tensor CNN::get_tensor_by_name(std::string tensorName) { + Tensor ret; if (this->tensorMap.find(tensorName) == this->tensorMap.end()) { - CHECK_STATUS(NOT_MATCH); + UNI_ERROR_LOG("Can not find output:%s to get.\n", tensorName.c_str()); + } else { + ret = *(this->tensorMap[tensorName].get()); } - return *(this->tensorMap[tensorName].get()); + return ret; } TensorDesc CNN::get_tensor_desc_by_name(std::string tensorName) @@ -404,6 +429,52 @@ std::map CNN::get_output_desc() return descs; } +void CNN::update_tensor_positions() +{ + std::unordered_map m; + for (auto &opName : this->sortedOps) { + auto op = this->operatorMap[opName]; + if (op->get_type() == OT_Reshape) { + std::vector curOpInputTensorName = this->operatorTensorMap[opName][0]; + std::vector curOpOutputTensorName = this->operatorTensorMap[opName][1]; + auto tensor = this->tensorMap[curOpInputTensorName[0]]; + if ((tensor->get_desc().df != DF_NCHWC8) && + (tensor->get_desc().df != DF_NCHWC16)) + { + std::vector tensorPositions = op->get_tensor_positions(); + m[curOpInputTensorName[0]] = m[curOpOutputTensorName[0]] = tensorPositions[0] = -1; + tensorPositions[1] = -3; + // when slot is -3, reuse the input tensor mem. 
+ op->set_tensor_positions(tensorPositions); + } + } + } + if (!m.empty()) { + for (auto &opName : this->sortedOps) { + auto op = this->operatorMap[opName]; + std::vector tensorPositions = op->get_tensor_positions(); + if (tensorPositions.size() > 1 && tensorPositions[1] == -3) { + continue; + } + bool update = false; + for (U32 i = 0, tensorIter = 0; i < 2; ++i) { + U32 iterSize = this->operatorTensorMap[opName][i].size(); + for (U32 j = 0; j < iterSize; ++j) { + std::string tensorName = this->operatorTensorMap[opName][i][j]; + if (m.count(tensorName)) { + tensorPositions[tensorIter] = m[tensorName]; + update = true; + } + ++tensorIter; + } + } + if (update) { + op->set_tensor_positions(tensorPositions); + } + } + } +} + EE CNN::infer_output_tensors_size(std::map inputDescMap) { UNI_DEBUG_LOG("Infer tensor dimension...\n"); @@ -413,6 +484,9 @@ EE CNN::infer_output_tensors_size(std::map inputDescMap "model input: %s desc %s\n", iter.first.c_str(), tensorDesc2Str(iter.second).c_str()); } this->infer_layout_desc(); +#ifndef _USE_GPU + this->update_tensor_positions(); +#endif this->update_op_tensors(); UNI_DEBUG_LOG("Infer tensor dimension end.\n"); return SUCCESS; @@ -452,7 +526,7 @@ void CNN::assign_output_tensor() // tensorPositions[tensorIter]); auto tensor = this->tensorMap[tensorName]; bool needAssign = true; - if (i == 0 && this->inputTensors.find(tensorName) == this->inputTensors.end()) { + if (i == 0 && (this->inputTensors.find(tensorName) == this->inputTensors.end())) { needAssign = false; } if (this->weightOpOutputNames.find(tensorName) != this->weightOpOutputNames.end()) { @@ -462,6 +536,7 @@ void CNN::assign_output_tensor() I32 slot = tensorPositions[tensorIter]; if (slot >= 0) { tensor->reuse(get_reuse_memory(slot, tensor.get())); + } else if (slot == -1) { tensor->alloc(); #ifdef _USE_GPU @@ -469,6 +544,9 @@ void CNN::assign_output_tensor() auto mem = (OclMemory *)tensor->get_memory(); mem->mapped_alloc(); #endif + } else if (slot == -3) { + // when slot is -3, reuse the input tensor mem. 
+ tensor->reuse(&(tensors[0][0])); } } tensorIter++; @@ -489,7 +567,37 @@ void CNN::run() if (op->get_type() == OT_Repeat || op->get_type() == OT_Jump) { opIndex = op->get_next_operator_index(); } else { - UNI_PROFILE(op->run(), op->get_name(), + if (this->dynamicOutputSize) { + std::vector inputs = op->get_input_tensors(); + std::vector outputs = op->get_output_tensors(); + std::vector in, out; + for (U32 i = 0; i < inputs.size(); i++) { + in.push_back(&inputs[i]); + } + for (U32 i = 0; i < outputs.size(); i++) { + out.push_back(&outputs[i]); + } + op->infer_output_tensors_size(in, out); + } +#ifdef _DEBUG + std::vector inputTensors = op->get_input_tensors(); + std::vector inputNames = operatorTensorMap[op->get_name()][0]; + for (U32 i = 0; i < inputTensors.size(); i++) { + Tensor inputTensor = inputTensors[i]; + std::string line = inputTensor.string(8); + UNI_DEBUG_LOG(" input:%s %s\n", inputNames[i].c_str(), line.c_str()); + } +#endif + UNI_PROFILE( + { + op->run(); +#if defined(_USE_GPU) && defined(_PROFILE) + if (IS_GPU(this->deviceInfo.schedule)) { + gcl_finish(OCLContext::getInstance().handle.get()); + } +#endif + }, + op->get_name(), std::string(OperatorTypeName()[op->get_type()]) + std::string("::run")); opIndex++; } @@ -562,7 +670,7 @@ void CNN::set_op_tensors_positions(std::shared_ptr op, U32 outputTensorsNum = outputTensorsName.size(); U32 numTensors = inputTensorsNum + outputTensorsNum; std::vector tensorPositions(numTensors); - memcpy(tensorPositions.data(), tensor_positions, numTensors * bytesOf(DT_I32)); + UNI_MEMCPY(tensorPositions.data(), tensor_positions, numTensors * bytesOf(DT_I32)); if (IS_GPU(this->deviceInfo.schedule)) { for (U32 j = 0; j < numTensors; j++) { std::string curTensorName; @@ -636,7 +744,7 @@ void CNN::set_input_desc(std::map inputDescMap) { for (auto &iter : inputDescMap) { if (tensorMap.find(iter.first) == tensorMap.end()) { - UNI_WARNING_LOG("Unused model input node: %s\n", iter.first.c_str()); + UNI_WARNING_LOG("unused model input node: %s\n", iter.first.c_str()); continue; } TensorDesc desc = iter.second; diff --git a/inference/engine/src/data_loader.cpp b/inference/engine/src/data_loader.cpp index a7860250..c5769ae2 100644 --- a/inference/engine/src/data_loader.cpp +++ b/inference/engine/src/data_loader.cpp @@ -18,6 +18,7 @@ #include "data_loader.hpp" #include #include +#include #ifdef _BUILD_TEST #include @@ -50,7 +51,7 @@ std::vector load_jpeg( info.out_color_space); CHECK_REQUIREMENT(2 == info.out_color_space); // Support RGB for now - U8 *data = (U8 *)malloc(dataSize); + U8 *data = (U8 *)UNI_MALLOC(dataSize); JSAMPROW row_pointer[1]; while (info.output_scanline < info.output_height) { row_pointer[0] = data + info.output_scanline * width * numChannels; @@ -76,7 +77,7 @@ std::vector load_jpeg( b[i] = dataMov[2]; dataMov += numChannels; } - free(data); + UNI_FREE(data); std::shared_ptr imageTensor = load_resize_image(rgbTensor, imageDesc[0], ImageFormat, scaleValue); @@ -130,7 +131,8 @@ void get_files(std::string directoryName, std::vector &files) } struct dirent *file; while ((file = readdir(directory)) != NULL) { - if (strcmp(file->d_name, ".") == 0 || strcmp(file->d_name, "..") == 0) { + if (std::string(file->d_name) == std::string(".") || + std::string(file->d_name) == std::string("..")) { continue; } struct stat st; @@ -146,7 +148,7 @@ void get_files(std::string directoryName, std::vector &files) closedir(directory); } -Tensor fscanfReadData(FILE *f, TensorDesc desc) +Tensor readFileData(std::ifstream &file, TensorDesc desc) { Tensor tensor 
= Tensor::alloc_sized(desc); U32 size = tensor.length(); @@ -156,41 +158,37 @@ Tensor fscanfReadData(FILE *f, TensorDesc desc) case DT_F32: { F32 *dataPtr = (F32 *)ptr; for (U32 i = 0; i < size; i++) { - fscanf(f, "%f", dataPtr + i); + file >> dataPtr[i]; } break; } -#ifdef __aarch64__ +#ifdef _USE_FP16 case DT_F16: { F16 *dataPtr = (F16 *)ptr; F32 value; for (U32 i = 0; i < size; i++) { - fscanf(f, "%f", &value); + file >> value; dataPtr[i] = (F16)value; } break; } #endif case DT_U32: { - F32 value = 0; U32 *dataPtr = (U32 *)ptr; for (U32 i = 0; i < size; i++) { - fscanf(f, "%f", &value); - dataPtr[i] = value; + file >> dataPtr[i]; } break; } case DT_I32: { - F32 value = 0; I32 *dataPtr = (I32 *)ptr; for (U32 i = 0; i < size; i++) { - fscanf(f, "%f", &value); - dataPtr[i] = value; + file >> dataPtr[i]; } break; } default: - CHECK_STATUS(NOT_SUPPORTED); + UNI_ERROR_LOG("not support to read %s type data.\n", DataTypeName()[dataType]); break; } return tensor; @@ -211,31 +209,37 @@ std::vector load_fake_data(std::vector dataDesc) std::vector load_txt(std::string dataPath, std::vector dataDesc) { std::vector result; - FILE *f = fopen(dataPath.c_str(), "r"); - CHECK_REQUIREMENT(f != nullptr); + std::ifstream file; + file.open(dataPath.c_str()); + if (!file.is_open()) { + UNI_ERROR_LOG("can not read %s.\n", dataPath.c_str()); + } for (U32 index = 0; index < dataDesc.size(); index++) { - result.push_back(fscanfReadData(f, dataDesc[index])); + result.push_back(readFileData(file, dataDesc[index])); } - fclose(f); + file.close(); return result; } std::vector load_seq(std::string dataPath, std::vector dataDesc) { std::vector result; - FILE *f = fopen(dataPath.c_str(), "r"); - CHECK_REQUIREMENT(f != nullptr); + std::ifstream file; + file.open(dataPath.c_str()); + if (!file.is_open()) { + UNI_ERROR_LOG("can not read %s.\n", dataPath.c_str()); + } for (U32 index = 0; index < dataDesc.size(); index++) { U32 sequenceLen = 0; - fscanf(f, "%u", &sequenceLen); + file >> sequenceLen; TensorDesc sequenceDesc = dataDesc[index]; sequenceDesc.dims[0] = sequenceLen; for (U32 j = 1; j < sequenceDesc.nDims; j++) { sequenceDesc.dims[j] = 1; } - result.push_back(fscanfReadData(f, sequenceDesc)); + result.push_back(readFileData(file, sequenceDesc)); } - fclose(f); + file.close(); return result; } @@ -253,21 +257,15 @@ std::vector load_bin( Tensor tensor = Tensor::alloc_sized(sourceDesc); U32 len = tensor.length(); auto ptr = ((CpuMemory *)(tensor.get_memory()))->get_ptr(); - U32 readLength = fread(ptr, bytesOf(sourceDataType[index]), len, f); - CHECK_REQUIREMENT(len == readLength); + CHECK_REQUIREMENT(fread(ptr, bytesOf(sourceDataType[index]), len, f) == len); if (sourceDataType[index] != dataDesc[index].dt) { Tensor transform_tensor = Tensor::alloc_sized(dataDesc[index]); - if (0) { -#ifdef __aarch64__ - } else if (sourceDataType[index] == DT_F32 && dataDesc[index].dt == DT_F16) { - F32 *ptr1 = (F32 *)ptr; - F16 *ptr2 = (F16 *)((CpuMemory *)(transform_tensor.get_memory()))->get_ptr(); - for (U32 i = 0; i < len; i++) { - ptr2[i] = (F16)ptr1[i]; - } -#endif + if (sourceDataType[index] == DT_F32) { + transformFromFloat(dataDesc[index].dt, (const float *)ptr, + ((CpuMemory *)(transform_tensor.get_memory()))->get_ptr(), len); } else { - CHECK_STATUS(NOT_SUPPORTED); + UNI_ERROR_LOG("not support to read+transform %s data.\n", + DataTypeName()[sourceDataType[index]]); } result.push_back(transform_tensor); } else { @@ -315,3 +313,21 @@ std::vector load_data(std::string directoryPath, } return dataPaths; } + +bool 
is_directory(std::string path) +{ + bool ret = false; + struct stat s; + if (stat(path.c_str(), &s) == 0) { + if (s.st_mode & S_IFDIR) { + ret = true; + } else if (s.st_mode & S_IFREG) { + ret = false; + } else { + UNI_ERROR_LOG("can not recognize %s.\n", path.c_str()); + } + } else { + UNI_ERROR_LOG("%s is not exist.\n", path.c_str()); + } + return ret; +} diff --git a/inference/engine/src/model.cpp b/inference/engine/src/model.cpp new file mode 100644 index 00000000..5fa2c599 --- /dev/null +++ b/inference/engine/src/model.cpp @@ -0,0 +1,168 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "model.hpp" +#include "thread_affinity.h" + +Model::Model(AffinityPolicy affinityPolicy, DataType dt, std::string name) +{ + this->set_device_info(affinityPolicy); + this->dt = dt; + this->name = name; + std::string deviceName = ""; + if (IS_GPU(this->deviceInfo.schedule)) { +#ifdef _USE_GPU + if (OCLContext::getInstance().handle->useQualcommDev) { + this->deviceInfo.schedule = QUALCOMM; + } +#else + UNI_ERROR_LOG("This library not support ARM MALI/Qualcomm GPU, please rebuild library " + "with --gpu option.\n"); + exit(1); +#endif + } + algorithmMap = std::shared_ptr( + new AlgorithmMap(this->deviceInfo.schedule, name, deviceName, dt)); +} + +void Model::set_runtime_device(int cpuId, int threadId) +{ + this->set_runtime_device(cpuId, this->deviceInfo.archs[cpuId], threadId); +} + +void Model::set_runtime_device(int cpuId, Arch arch, int threadId) +{ + this->deviceInfo.schedule = arch; + UNI_DEBUG_LOG("Inference use %s.\n", ArchName()[this->deviceInfo.schedule]) + if (cpuId >= 0 && cpuId < this->deviceInfo.cpuNum) { + set_thread_affinity(threadId, &cpuId, 1); + for (auto op : ops) { + op->set_schedule(this->deviceInfo.schedule); + } + } +} + +void Model::set_runtime_device_dynamic(int threadId) +{ + set_cpu_dynamic(&this->deviceInfo, threadId); +} + +Arch Model::get_runtime_device() +{ + return this->deviceInfo.schedule; +} + +void Model::ready(std::map inputDescMap) +{ + infer_output_tensors_size(inputDescMap); + assign_output_tensor(); + + infer_tmp_memory_size(); + assign_tmp_tensor(); +} + +#ifdef _USE_INT8 +U32 Model::find_next_dynamic_scale_op(std::vector calibratedOpIdx, U32 startIdx) +{ + CHECK_REQUIREMENT(startIdx < this->ops.size()) + for (U32 i = startIdx; i < this->ops.size();) { + auto op = this->ops[i]; + if (op->is_dynamic_scale()) { + bool calibrated = false; + for (auto idx : calibratedOpIdx) { + if (i == 
idx) { + calibrated = true; + break; + } + } + if (!calibrated) { + return i; + } + } + + if (op->get_type() == OT_Repeat || op->get_type() == OT_Jump) { + i = op->get_next_operator_index(); + } else { + i++; + } + } + + return 0; // The first layer should never be quantized +} + +std::shared_ptr Model::get_operator_by_index(U32 index) +{ + return this->ops[index]; +} + +void Model::run_till_breakpoint(U32 opIdx) +{ + CHECK_REQUIREMENT(IS_CPU(this->deviceInfo.schedule)); + for (U32 i = 0; i < this->ops.size();) { + auto op = this->ops[i]; + if (op->get_type() == OT_Repeat || op->get_type() == OT_Jump) { + if (opIdx == i) { + break; + } + i = op->get_next_operator_index(); + } else { + op->run(); + if (opIdx == i) { + break; + } + i++; + } + } +} +#endif + +std::string Model::get_name() +{ + return this->name; +} + +void Model::loadAlgorithmMap(CI8 *path, bool useFileStream) +{ + std::string algoName = this->algorithmMap->getAlgorithmFileName(); + CI8 *algoInfo = nullptr; + if (IS_GPU(this->deviceInfo.schedule)) { +#ifdef _USE_GPU + algoInfo = gcl_get_algorithm_info(OCLContext::getInstance().handle.get(), algoName); +#endif + } + if (!algoInfo && useFileStream) { + algoInfo = path; + } + if (algoInfo) { + this->algorithmMap->loadAlgorithmMapFromFileStream(algoInfo); + } else if (path) { + this->algorithmMap->loadAlgorithmMapFromFile(path); + } +} + +void Model::saveAlgorithmMapToFile(std::string algorithmMapPath) +{ + this->algorithmMap->saveAlgorithmMapToFile(algorithmMapPath); +} + +void Model::set_device_info(AffinityPolicy affinityPolicy) +{ +#ifndef _USE_IOS + this->deviceInfo = get_cpu_info(affinityPolicy); + this->set_runtime_device_dynamic(); +#else + this->deviceInfo.affinityPolicy = affinityPolicy; + this->deviceInfo.schedule = ARM_A76; +#endif + UNI_DEBUG_LOG("Inference use %s.\n", ArchName()[this->deviceInfo.schedule]) +} diff --git a/inference/engine/src/model_calibration.cpp b/inference/engine/src/model_calibration.cpp index bedc2da8..3acde351 100644 --- a/inference/engine/src/model_calibration.cpp +++ b/inference/engine/src/model_calibration.cpp @@ -287,7 +287,7 @@ void calibrate_model_with_dataset(std::string dataPath, tensorSize.push_back( tensorNumElements(resizedTensors[tensorPosition[i].second].get_desc())); dBytes = tensorSize.back() * elementBytes; - memcpy(d, + UNI_MEMCPY(d, ((CpuMemory *)(resizedTensors[tensorPosition[i].second].get_memory()))->get_ptr(), dBytes); @@ -347,13 +347,13 @@ void calibrate_model_with_dataset(std::string dataPath, resultMs->ops[opIdx].num_quant_feature = scales.size(); resultMs->ops[opIdx].feature_scale = - (QuantSpec *)mt_new_storage(scales.size() * sizeof(QuantSpec)); + (QuantSpec *)mt_malloc(scales.size() * sizeof(QuantSpec)); for (U32 i = 0; i < scales.size(); i++) { resultMs->ops[opIdx].feature_scale[i].num_scale = scales[i].size(); U32 scaleBytes = scales[i].size() * sizeof(F32); - resultMs->ops[opIdx].feature_scale[i].scale = (F32 *)mt_new_storage(scaleBytes); - memcpy(resultMs->ops[opIdx].feature_scale[i].scale, scales[i].data(), scaleBytes); + resultMs->ops[opIdx].feature_scale[i].scale = (F32 *)mt_malloc(scaleBytes); + UNI_MEMCPY(resultMs->ops[opIdx].feature_scale[i].scale, scales[i].data(), scaleBytes); } calibratedOpIdx.push_back(opIdx); diff --git a/inference/engine/src/result_format.cpp b/inference/engine/src/result_format.cpp index bd7abb67..4eb96155 100644 --- a/inference/engine/src/result_format.cpp +++ b/inference/engine/src/result_format.cpp @@ -23,7 +23,7 @@ std::vector topK_index(U8 *res, TensorDesc desc, U32 topK) } 
switch (desc.dt) { -#ifdef __aarch64__ +#ifdef _USE_FP16 case DT_F16: { F16 *dataPtr = (F16 *)res; sort(index.begin(), index.end(), diff --git a/inference/engine/tools/common_algo_search/common_algo_search.cpp b/inference/engine/tools/common_algo_search/common_algo_search.cpp index b30ddf20..03f4dbb4 100644 --- a/inference/engine/tools/common_algo_search/common_algo_search.cpp +++ b/inference/engine/tools/common_algo_search/common_algo_search.cpp @@ -55,10 +55,10 @@ int convolutionCPUFloatAlgorithmSearch(Arch arch, DataType dt, std::string path) } convParamSpec.stride_h = sv; convParamSpec.stride_w = sv; - convParamSpec.padding_left = pl; - convParamSpec.padding_right = pr; - convParamSpec.padding_top = pt; - convParamSpec.padding_bottom = pb; + convParamSpec.pad_left = pl; + convParamSpec.pad_right = pr; + convParamSpec.pad_top = pt; + convParamSpec.pad_bottom = pb; filterDesc = tensor4df(dt, DF_NCHW, fn, ic, fv, fv); Tensor inputTensor; Tensor outputTensor; diff --git a/inference/engine/tools/model_finetuner/model_finetuner.cpp b/inference/engine/tools/model_finetuner/model_finetuner.cpp index 3cb0510b..492383e2 100644 --- a/inference/engine/tools/model_finetuner/model_finetuner.cpp +++ b/inference/engine/tools/model_finetuner/model_finetuner.cpp @@ -58,7 +58,7 @@ void load_cifar10(U8 *dataset, U32 batchIdx, TensorDesc inDesc, float *pixels, T archInfo.arch = CPU_GENERAL; if (training) { - memset(labels, 0, BATCH_SIZE * 10 * sizeof(float)); + UNI_MEMSET(labels, 0, BATCH_SIZE * 10 * sizeof(float)); } Tensor tmp, output; output.resize(inDesc); diff --git a/inference/engine/tools/preprocess_ocl/preprocess_ocl.cpp b/inference/engine/tools/preprocess_ocl/preprocess_ocl.cpp index d79d29b0..4e33d216 100644 --- a/inference/engine/tools/preprocess_ocl/preprocess_ocl.cpp +++ b/inference/engine/tools/preprocess_ocl/preprocess_ocl.cpp @@ -17,7 +17,6 @@ #include "result_format.hpp" #include #include -#include #include #include #include @@ -70,52 +69,50 @@ inline void write_to_file(std::string str, std::string path, std::string name) inline void runBoltModel( CI8 *modelPath, CI8 *algoPath, std::map> *kernelInfos) { + UNI_INFO_LOG("Build gpu kernels and algorithm map file for bolt model(%s)...\n", modelPath); if (!strstr(modelPath, "f16.bolt")) { - UNI_ERROR_LOG("Bolt gpu only support F16(_f16.bolt) now\n"); - UNI_ERROR_LOG("Ensure your model is xxxx_f16.bolt\n"); + UNI_ERROR_LOG("Bolt gpu only support float16 inference, and model file is end with " + "_f16.bolt suffix.\n"); exit(1); } - UNI_INFO_LOG("Building algofile and used kernelInfos for %s\n", modelPath); - - ModelHandle model_address = model_address = CreateModel(modelPath, GPU, algoPath); - int num_input = GetNumInputsFromModel(model_address); - int *n = (int *)malloc(sizeof(int) * num_input); - int *c = (int *)malloc(sizeof(int) * num_input); - int *h = (int *)malloc(sizeof(int) * num_input); - int *w = (int *)malloc(sizeof(int) * num_input); - char **name = (char **)malloc(sizeof(char *) * num_input); - for (int i = 0; i < num_input; i++) { - name[i] = (char *)malloc(sizeof(char) * 1024); + ModelHandle model = CreateModel(modelPath, GPU, algoPath); + int input_num = GetNumInputsFromModel(model); + int *input_n = (int *)malloc(sizeof(int) * input_num); + int *input_c = (int *)malloc(sizeof(int) * input_num); + int *input_h = (int *)malloc(sizeof(int) * input_num); + int *input_w = (int *)malloc(sizeof(int) * input_num); + DATA_TYPE *input_dt = (DATA_TYPE *)malloc(sizeof(DATA_TYPE) * input_num); + DATA_FORMAT *input_df = (DATA_FORMAT 
*)malloc(sizeof(DATA_FORMAT) * input_num); + char **input_name = (char **)malloc(sizeof(char *) * input_num); + for (int i = 0; i < input_num; i++) { + input_name[i] = (char *)malloc(sizeof(char) * 1024); } - DATA_TYPE *dt_input = (DATA_TYPE *)malloc(sizeof(DATA_TYPE) * num_input); - DATA_FORMAT *df_input = (DATA_FORMAT *)malloc(sizeof(DATA_FORMAT) * num_input); - GetInputDataInfoFromModel(model_address, num_input, name, n, c, h, w, dt_input, df_input); - unsigned char **input_ptr = (unsigned char **)malloc(sizeof(unsigned char *) * num_input); - for (int i = 0; i < num_input; i++) { - int length = n[i] * c[i] * h[i] * w[i]; - F16 *ptr = (F16 *)malloc(sizeof(F16) * length); - for (int i = 0; i < length; i++) { - ptr[i] = 1; - } - input_ptr[i] = (unsigned char *)ptr; + GetInputDataInfoFromModel( + model, input_num, input_name, input_n, input_c, input_h, input_w, input_dt, input_df); + unsigned char **input_ptr = (unsigned char **)malloc(sizeof(unsigned char *) * input_num); + for (int i = 0; i < input_num; i++) { + int length = input_n[i] * input_c[i] * input_h[i] * input_w[i]; + input_ptr[i] = (unsigned char *)malloc(sizeof(F16) * length); + UNI_INIT(length, DT_F16, 1, input_ptr[i]); } - PrepareModel(model_address, num_input, (const char **)name, n, c, h, w, dt_input, df_input); - ResultHandle model_result = AllocAllResultHandle(model_address); - int model_result_num = GetNumOutputsFromResultHandle(model_result); - int *output_n = (int *)malloc(sizeof(int) * model_result_num); - int *output_c = (int *)malloc(sizeof(int) * model_result_num); - int *output_h = (int *)malloc(sizeof(int) * model_result_num); - int *output_w = (int *)malloc(sizeof(int) * model_result_num); - char **outputNames = (char **)malloc(sizeof(char *) * model_result_num); - for (int i = 0; i < model_result_num; i++) { - outputNames[i] = (char *)malloc(sizeof(char) * 1024); + PrepareModel(model, input_num, (const char **)input_name, input_n, input_c, input_h, input_w, + input_dt, input_df); + ResultHandle result = AllocAllResultHandle(model); + int output_num = GetNumOutputsFromResultHandle(result); + int *output_n = (int *)malloc(sizeof(int) * output_num); + int *output_c = (int *)malloc(sizeof(int) * output_num); + int *output_h = (int *)malloc(sizeof(int) * output_num); + int *output_w = (int *)malloc(sizeof(int) * output_num); + DATA_TYPE *output_dt = (DATA_TYPE *)malloc(sizeof(DATA_TYPE) * output_num); + DATA_FORMAT *output_df = (DATA_FORMAT *)malloc(sizeof(DATA_FORMAT) * output_num); + char **output_name = (char **)malloc(sizeof(char *) * output_num); + for (int i = 0; i < output_num; i++) { + output_name[i] = (char *)malloc(sizeof(char) * 1024); } - DATA_TYPE *dt_output = (DATA_TYPE *)malloc(sizeof(DATA_TYPE) * model_result_num); - DATA_FORMAT *df_output = (DATA_FORMAT *)malloc(sizeof(DATA_FORMAT) * model_result_num); - GetOutputDataInfoFromResultHandle(model_result, model_result_num, outputNames, output_n, - output_c, output_h, output_w, dt_output, df_output); - RunModel(model_address, model_result, num_input, (const char **)name, (void **)input_ptr); + GetOutputDataInfoFromResultHandle(result, output_num, output_name, output_n, output_c, output_h, + output_w, output_dt, output_df); + RunModel(model, result, input_num, (const char **)input_name, (void **)input_ptr); GCLHandle_t handle = OCLContext::getInstance().handle.get(); for (auto p : handle->kernelMap) { @@ -153,31 +150,31 @@ inline void runBoltModel( } } CHECK_STATUS(gcl_finish(handle)); - FreeResultHandle(model_result); - DestroyModel(model_address); + 
FreeResultHandle(result); + DestroyModel(model); - free(n); - free(c); - free(h); - free(w); - free(dt_input); - free(df_input); - for (int i = 0; i < num_input; i++) { - free(name[i]); + free(input_n); + free(input_c); + free(input_h); + free(input_w); + free(input_dt); + free(input_df); + for (int i = 0; i < input_num; i++) { + free(input_name[i]); free(input_ptr[i]); } - free(name); + free(input_name); free(input_ptr); free(output_n); free(output_c); free(output_h); free(output_w); - free(dt_output); - free(df_output); - for (int i = 0; i < model_result_num; i++) { - free(outputNames[i]); + free(output_dt); + free(output_df); + for (int i = 0; i < output_num; i++) { + free(output_name[i]); } - free(outputNames); + free(output_name); } inline void buildFileStream(CI8 *fileName, U8 **bytesPtr, U32 *len) @@ -238,6 +235,7 @@ inline void buildKernelBinFiles(std::map> kernelInf device_map += "#include \"gcl_kernel_binmap.h\"\n"; device_map += "#include \"" + device_map_head_name + "\"\n"; + I8 buffer[16]; for (auto p : kernelInfos) { std::string kernelName = p.first; std::vector binaryInfo = p.second; @@ -247,12 +245,11 @@ inline void buildKernelBinFiles(std::map> kernelInf device_map += "const unsigned int " + func + "_len = " + std::to_string(len) + ";\n"; device_map += "const unsigned char " + func + "[] = " + "{"; for (U32 i = 0; i < len; i++) { - I8 tempstr[4]; if (i % 20 == 0) { device_map += "\n"; } - sprintf(tempstr, "0x%02x", binaryInfo[i]); - device_map += std::string(tempstr); + sprintf(buffer, "0x%02x", binaryInfo[i]); + device_map += std::string(buffer); if (i != len - 1) { device_map += ", "; } else { @@ -269,12 +266,11 @@ inline void buildKernelBinFiles(std::map> kernelInf device_map += "const unsigned int " + algoName + "_len = " + std::to_string(len) + ";\n"; device_map += "const unsigned char " + algoName + "[] = " + "{"; for (U32 i = 0; i < len; i++) { - I8 tempstr[4]; if (i % 20 == 0) { device_map += "\n"; } - sprintf(tempstr, "0x%02x", bytes[i]); - device_map += std::string(tempstr); + sprintf(buffer, "0x%02x", bytes[i]); + device_map += std::string(buffer); if (i != len - 1) { device_map += ", "; } else { @@ -333,29 +329,10 @@ int main(int argc, char *argv[]) exit(1); } I8 lastFlag; - std::string modelsPath = (CI8 *)argv[1]; - lastFlag = modelsPath[modelsPath.length() - 1]; - if (strcmp(&lastFlag, "/") != 0) { - modelsPath += "/"; - } - - std::string algoPath = (CI8 *)argv[2]; - lastFlag = algoPath[algoPath.length() - 1]; - if (strcmp(&lastFlag, "/") != 0) { - algoPath += "/"; - } - - std::string includePath = (CI8 *)argv[3]; - lastFlag = includePath[includePath.length() - 1]; - if (strcmp(&lastFlag, "/") != 0) { - includePath += "/"; - } - - std::string cppPath = (CI8 *)argv[4]; - lastFlag = cppPath[cppPath.length() - 1]; - if (strcmp(&lastFlag, "/") != 0) { - cppPath += "/"; - } + std::string modelsPath = (CI8 *)argv[1] + std::string("/"); + std::string algoPath = (CI8 *)argv[2] + std::string("/"); + std::string includePath = (CI8 *)argv[3] + std::string("/"); + std::string cppPath = (CI8 *)argv[4] + std::string("/"); std::vector modelNamesArray; modelNamesArray = buildFileNamesArray(modelsPath, ".bolt"); diff --git a/inference/examples/CMakeLists.txt b/inference/examples/CMakeLists.txt index 17356469..e494d978 100644 --- a/inference/examples/CMakeLists.txt +++ b/inference/examples/CMakeLists.txt @@ -21,13 +21,14 @@ engine_test(benchmark benchmark/benchmark.cpp) install(TARGETS benchmark RUNTIME DESTINATION examples) if (BUILD_TEST) - 
engine_test(c_image_classifification "c_api/c_image_classifification.c;c_api/c_common.c;c_api/c_test.c") + engine_test(c_image_classification "c_api/c_image_classification.c;c_api/c_common.c;c_api/c_test.c") engine_test(c_input_method "c_api/c_input_method.c;c_api/c_common.c;c_api/c_test.c") engine_test(bert bert/bert.cpp) engine_test(tinybert bert/tinybert.cpp) engine_test(classification image_classification/classification.cpp) engine_test(nmt machine_translation/nmt.cpp) engine_test(nmt_tsc machine_translation/nmt_tsc.cpp) + engine_test(tsc_ssru machine_translation/tsc_ssru.cpp) engine_test(asr_rnnt automatic_speech_recognition/asr_rnnt.cpp) engine_test(asr_convolution_transformer automatic_speech_recognition/asr_convolution_transformer.cpp) engine_test(tts text_to_speech/tts.cpp) @@ -47,11 +48,12 @@ if (BUILD_TEST) tinybert tinybert_onnx nmt + tsc_ssru asr_rnnt asr_convolution_transformer tts vad - c_image_classifification + c_image_classification c_input_method RUNTIME DESTINATION examples) diff --git a/inference/examples/automatic_speech_recognition/flow_asr.h b/inference/examples/automatic_speech_recognition/flow_asr.h index a7ef6401..0fd6a345 100644 --- a/inference/examples/automatic_speech_recognition/flow_asr.h +++ b/inference/examples/automatic_speech_recognition/flow_asr.h @@ -267,7 +267,7 @@ void loadBinary(const std::string fileName, char *data, size_t size) ifs.seekg(0, std::ifstream::beg); ifs.read(data, UNI_MIN(length, size)); if (length < size) { - memset(data + length, 0, size - length); + UNI_MEMSET(data + length, 0, size - length); } ifs.close(); } @@ -352,7 +352,7 @@ std::map> getEncoderInputOutput( case DT_F32: { F32 *ptr = (F32 *)((CpuMemory *)(tensors["sounds"]->get_memory()))->get_ptr(); for (int i = 0; i < frameLength; i++) { - memcpy(ptr + i * featureLength, feature[0][i + frameOffset].data(), + UNI_MEMCPY(ptr + i * featureLength, feature[0][i + frameOffset].data(), featureLength * sizeof(float)); } break; @@ -497,7 +497,7 @@ std::map> getEncoderInputOutput( if (iter.first != std::string("sounds")) { TensorDesc desc = iter.second->get_desc(); U8 *ptr = (U8 *)((CpuMemory *)(iter.second->get_memory()))->get_ptr(); - memset(ptr, 0, tensorNumBytes(desc)); + UNI_MEMSET(ptr, 0, tensorNumBytes(desc)); } } } @@ -606,7 +606,7 @@ std::map> getPinYin2HanZiInputOutput(int fr tensors["pinyin"]->resize(tensor2df(DT_U32, DF_NORMAL, 1, bufferLength)); tensors["pinyin"]->alloc(); if (frameId == 0) { - memset(buffer, 0, sizeof(unsigned int) * bufferLength); + UNI_MEMSET(buffer, 0, sizeof(unsigned int) * bufferLength); } int pinyin = *((unsigned int *)((CpuMemory *)(joint["output_argmax"]->get_memory()))->get_ptr()) - PINYIN_FEATURE_GAP; @@ -620,8 +620,8 @@ std::map> getPinYin2HanZiInputOutput(int fr buffer[bufferValidSize - 1] = pinyin; } unsigned int *ptr = (unsigned int *)((CpuMemory *)(tensors["pinyin"]->get_memory()))->get_ptr(); - memcpy(ptr, buffer, sizeof(unsigned int) * bufferValidSize); - memset(ptr + bufferValidSize, 0, sizeof(unsigned int) * (bufferLength - bufferValidSize)); + UNI_MEMCPY(ptr, buffer, sizeof(unsigned int) * bufferValidSize); + UNI_MEMSET(ptr + bufferValidSize, 0, sizeof(unsigned int) * (bufferLength - bufferValidSize)); tensors["hanzi_squeeze/Squeeze"] = std::shared_ptr(new Tensor()); std::shared_ptr tmp; diff --git a/inference/examples/automatic_speech_recognition/vad.cpp b/inference/examples/automatic_speech_recognition/vad.cpp index 2a327129..05ba0ab4 100644 --- a/inference/examples/automatic_speech_recognition/vad.cpp +++ 
b/inference/examples/automatic_speech_recognition/vad.cpp @@ -92,7 +92,7 @@ int main(int argc, char *argv[]) std::cout << "output_eoq: " << eoq.element(0) << " " << eoq.element(1) << std::endl; falseResult += verify(vad, eoq); Tensor outCache = pipeline->get_tensor_by_name("output_cache"); - memcpy(cache.data(), (U8 *)((CpuMemory *)(outCache.get_memory()))->get_ptr(), + UNI_MEMCPY(cache.data(), (U8 *)((CpuMemory *)(outCache.get_memory()))->get_ptr(), tensorNumBytes(cacheDesc)); } UNI_TIME_STATISTICS diff --git a/inference/examples/benchmark/benchmark.cpp b/inference/examples/benchmark/benchmark.cpp index 84a179d8..7a1171dd 100644 --- a/inference/examples/benchmark/benchmark.cpp +++ b/inference/examples/benchmark/benchmark.cpp @@ -43,11 +43,14 @@ void print_benchmark_usage() "5. -l [loopTime]: The running loopTimes. The default value is %d.\n" "6. -w [warmUp]: WarmUp times. The default value is %d.\n" "7. -t [threadsNum]: Parallel threads num. The default value is %d.\n" - "Example: ./benchmark -m /local/models/resnet50_f16.bolt\n", + "Example:\n" + " ./benchmark -m /local/models/resnet50_f16.bolt\n" + " ./benchmark -m /local/models/resnet50_f16.bolt -i ./input.txt\n" + " ./benchmark -m /local/models/resnet50_f16.bolt -i ./data/\n", loopTime, warmUp, threadsNum); } -void parse_options(int argc, char *argv[]) +int parse_options(int argc, char *argv[]) { std::cout << "\nPlease enter this command './benchmark --help' to get more usage " "information.\n"; @@ -55,7 +58,7 @@ void parse_options(int argc, char *argv[]) for (std::string arg : lineArgs) { if (arg == "--help" || arg == "-help" || arg == "--h" || arg == "-h") { print_benchmark_usage(); - exit(-1); + return 0; } } @@ -94,27 +97,35 @@ void parse_options(int argc, char *argv[]) default: std::cout << "Input option gets error, please check the params meticulously.\n"; print_benchmark_usage(); - exit(-1); + return 0; } } + return 1; } std::map> create_tensors_from_path( - std::string dataPath, std::shared_ptr pipeline) + std::string inputData, std::shared_ptr pipeline) { std::map inputDescMap = pipeline->get_input_desc(); - std::vector sourceDataTypes; - std::vector inputDescs; - for (auto iter : inputDescMap) { - TensorDesc curDesc = iter.second; - sourceDataTypes.push_back(curDesc.dt); - inputDescs.push_back(curDesc); - } std::vector input; - if (string_end_with(inputData, ".txt")) { - input = load_txt(inputData, inputDescs); + if (inputData != "" && is_directory(inputData)) { + for (auto iter : inputDescMap) { + std::string path = inputData + "/" + iter.first + ".txt"; + input.push_back(load_txt(path, {iter.second})[0]); + } } else { - input = load_bin(inputData, sourceDataTypes, inputDescs); + std::vector sourceDataTypes; + std::vector inputDescs; + for (auto iter : inputDescMap) { + TensorDesc curDesc = iter.second; + sourceDataTypes.push_back(curDesc.dt); + inputDescs.push_back(curDesc); + } + if (string_end_with(inputData, ".txt")) { + input = load_txt(inputData, inputDescs); + } else { + input = load_bin(inputData, sourceDataTypes, inputDescs); + } } std::map> model_tensors_input; int index = 0; @@ -145,7 +156,7 @@ std::map> get_output( for (auto iter : outMap) { Tensor result = *(iter.second); auto mem = (OclMemory *)result.get_memory(); - mem->get_mapped_ptr(); + UNI_PROFILE(mem->get_mapped_ptr(), "copy " + iter.first, std::string("output::copy")); } #else UNI_WARNING_LOG("this binary not support GPU, please recompile project with GPU " @@ -155,15 +166,26 @@ std::map> get_output( return outMap; } -int main(int argc, char *argv[]) 
+int benchmark(int argc, char *argv[]) { UNI_TIME_INIT - parse_options(argc, argv); + int ret = parse_options(argc, argv); + if (!ret) { + return 0; + } set_cpu_num_threads(threadsNum); // 1: set up the pipeline + double timeBegin = ut_time_ms(); auto pipeline = createPipeline(affinityPolicyName, modelPath, algorithmMapPath); +#ifdef _USE_GPU + if (std::string(affinityPolicyName) == std::string("GPU")) { + gcl_finish(OCLContext::getInstance().handle.get()); + } +#endif + double timeEnd = ut_time_ms(); + double prepareTime = timeEnd - timeBegin; // 2: create input data and feed the pipeline with it auto model_tensors_input = create_tensors_from_path(inputData, pipeline); @@ -171,16 +193,21 @@ int main(int argc, char *argv[]) std::map> outMap; // 3: warm up and run + UNI_TIME_STOP + timeBegin = ut_time_ms(); for (int i = 0; i < warmUp; i++) { pipeline->set_input_by_assign(model_tensors_input); pipeline->run(); outMap = get_output(pipeline, affinityPolicyName); } #ifdef _USE_GPU - if (strcmp(affinityPolicyName, "GPU") == 0) { + if (std::string(affinityPolicyName) == std::string("GPU")) { gcl_finish(OCLContext::getInstance().handle.get()); } #endif + timeEnd = ut_time_ms(); + double warmUpTime = timeEnd - timeBegin; + UNI_TIME_START double minTime = DBL_MAX; double maxTime = 0; @@ -201,10 +228,19 @@ int main(int argc, char *argv[]) print_result(outMap); UNI_TIME_STATISTICS - UNI_CI_LOG("total_time:%fms(loops=%d)\n", 1.0 * totalTime, loopTime); - UNI_CI_LOG("avg_time:%fms/data\n", 1.0 * totalTime / UNI_MAX(1, loopTime)); - UNI_CI_LOG("min_time:%fms/data\n", 1.0 * minTime); - UNI_CI_LOG("max_time:%fms/data\n", 1.0 * maxTime); + UNI_CI_LOG("model prepare_time:%fms\n", 1.0 * prepareTime); + UNI_CI_LOG("model warm_up_time:%fms\n", 1.0 * warmUpTime); + UNI_CI_LOG("run total_time:%fms(loops=%d)\n", 1.0 * totalTime, loopTime); + UNI_CI_LOG("run avg_time:%fms/data\n", 1.0 * totalTime / UNI_MAX(1, loopTime)); + UNI_CI_LOG("run min_time:%fms/data\n", 1.0 * minTime); + UNI_CI_LOG("run max_time:%fms/data\n", 1.0 * maxTime); pipeline->saveAlgorithmMapToFile(algorithmMapPath); return 0; } + +int main(int argc, char *argv[]) +{ + int ret = benchmark(argc, argv); + UNI_MEM_STATISTICS(); + return ret; +} diff --git a/inference/examples/bert/flow_tinybert.cpp b/inference/examples/bert/flow_tinybert.cpp index 859afc94..6268d168 100644 --- a/inference/examples/bert/flow_tinybert.cpp +++ b/inference/examples/bert/flow_tinybert.cpp @@ -36,7 +36,7 @@ std::map> inputOutput() tensors["tinybert_words"] = std::shared_ptr(new Tensor()); tensors["tinybert_words"]->resize(inputDesc); tensors["tinybert_words"]->alloc(); - memcpy(((CpuMemory *)tensors["tinybert_words"]->get_memory())->get_ptr(), words, + UNI_MEMCPY(((CpuMemory *)tensors["tinybert_words"]->get_memory())->get_ptr(), words, tensorNumBytes(inputDesc)); tensors["tinybert_positions"] = std::shared_ptr(new Tensor()); diff --git a/inference/examples/bert/graph_tinybert.cpp b/inference/examples/bert/graph_tinybert.cpp index 5eb77d5b..70bb9ea1 100644 --- a/inference/examples/bert/graph_tinybert.cpp +++ b/inference/examples/bert/graph_tinybert.cpp @@ -39,7 +39,7 @@ std::map> inputOutput() tensors["tinybert_words"] = std::shared_ptr(new Tensor()); tensors["tinybert_words"]->resize(inputDesc); tensors["tinybert_words"]->alloc(); - memcpy(((CpuMemory *)tensors["tinybert_words"]->get_memory())->get_ptr(), words, + UNI_MEMCPY(((CpuMemory *)tensors["tinybert_words"]->get_memory())->get_ptr(), words, tensorNumBytes(inputDesc)); tensors["tinybert_positions"] = 
std::shared_ptr(new Tensor()); diff --git a/inference/examples/bert/tinybert_test.h b/inference/examples/bert/tinybert_test.h index e3409cb2..7aba1717 100644 --- a/inference/examples/bert/tinybert_test.h +++ b/inference/examples/bert/tinybert_test.h @@ -146,7 +146,7 @@ inline void tinybertTest(int argc, loopTime = parse_res.loopTime.first; } - bool useGPU = (strcmp(affinityPolicyName, "GPU") == 0) ? true : false; + bool useGPU = std::string(affinityPolicyName) == std::string("GPU"); std::shared_ptr pipelineBase; UNI_PROFILE(pipelineBase = createPipeline(affinityPolicyName, modelPath, algorithmMapPath), std::string("bolt::prepare"), std::string("prepare")); diff --git a/inference/examples/c_api/Makefile b/inference/examples/c_api/Makefile new file mode 100644 index 00000000..d1e67afb --- /dev/null +++ b/inference/examples/c_api/Makefile @@ -0,0 +1,16 @@ +CC=aarch64-linux-android21-clang +CFLAGS=-O3 -I../../../inference/engine/include +LDFLAGS=../../../install_android-aarch64/lib/libbolt.a -llog \ + ../../../third_party/android-aarch64/opencl/lib/libOpenCL.so \ + -lm -lstdc++ + +targets: c_image_classification c_input_method + +c_image_classification: c_image_classification.o c_common.o c_test.o + ${CC} -o $@ $^ ${LDFLAGS} +c_input_method: c_input_method.o c_common.o c_test.o + ${CC} -o $@ $^ ${LDFLAGS} +%.o: %.c + $(CC) -c $(CFLAGS) $< -o $@ +clean: + rm -rf *.o c_image_classification c_input_method diff --git a/inference/examples/c_api/c_common.c b/inference/examples/c_api/c_common.c index 97e4d7ba..15d83476 100644 --- a/inference/examples/c_api/c_common.c +++ b/inference/examples/c_api/c_common.c @@ -88,7 +88,7 @@ void MallocTensor(int num, const DATA_FORMAT *df, void ***data) { - *data = malloc(sizeof(void *) * num); + *data = (void **)malloc(sizeof(void *) * num); for (int i = 0; i < num; i++) { int length = n[i] * c[i] * h[i] * w[i]; switch (dt[i]) { @@ -128,9 +128,7 @@ void CreateInference(int useModelFileStream, const char *algorithmMapPath, AFFINITY_TYPE affinity, ModelHandle *inferenceHandle, - ResultHandle *resultHandle, - int *inputNum, - char ***inputName) + ResultHandle *resultHandle) { if (useModelFileStream) { *inferenceHandle = CreateModelWithFileStream(modelPath, affinity, algorithmMapPath); @@ -153,12 +151,10 @@ void CreateInference(int useModelFileStream, *resultHandle = AllocAllResultHandle(*inferenceHandle); - *inputNum = in_num; - *inputName = in_name; - //for (int i = 0; i < in_num; i++) { - // free(in_name[i]); - //} - //free(in_name); + for (int i = 0; i < in_num; i++) { + free(in_name[i]); + } + free(in_name); free(in_n); free(in_c); free(in_h); diff --git a/inference/examples/c_api/c_common.h b/inference/examples/c_api/c_common.h index 49d8241c..10c3afe5 100644 --- a/inference/examples/c_api/c_common.h +++ b/inference/examples/c_api/c_common.h @@ -61,7 +61,5 @@ void CreateInference(int useModelFileStream, const char *algorithmMapPath, AFFINITY_TYPE affinity, ModelHandle *inferenceHandle, - ResultHandle *resultHandle, - int *inputNum, - char ***inputName); + ResultHandle *resultHandle); #endif diff --git a/inference/examples/c_api/c_image_classifification.c b/inference/examples/c_api/c_image_classification.c similarity index 86% rename from inference/examples/c_api/c_image_classifification.c rename to inference/examples/c_api/c_image_classification.c index 87b583da..e7414293 100644 --- a/inference/examples/c_api/c_image_classifification.c +++ b/inference/examples/c_api/c_image_classification.c @@ -18,15 +18,13 @@ int main(int argc, char *argv[]) ParseOptions(argc, 
argv); ModelHandle inferenceHandle; ResultHandle resultHandle; - int inNum; - char **inName; if (useFileStream) { char *modelFileStream = BuildFileStream(modelPath); CreateInference(useFileStream, modelFileStream, algorithmMapPath, affinity, - &inferenceHandle, &resultHandle, &inNum, &inName); + &inferenceHandle, &resultHandle); } else { - CreateInference(useFileStream, modelPath, algorithmMapPath, affinity, &inferenceHandle, - &resultHandle, &inNum, &inName); + CreateInference( + useFileStream, modelPath, algorithmMapPath, affinity, &inferenceHandle, &resultHandle); } int inputNum, *inputN, *inputC, *inputH, *inputW; @@ -39,6 +37,10 @@ int main(int argc, char *argv[]) MallocTensor(inputNum, inputName, inputN, inputC, inputH, inputW, inputDT, inputDF, &inputData); InitTensor(inputNum, inputName, inputN, inputC, inputH, inputW, inputDT, inputDF, inputData, 1); + PrintTensor(inputNum, inputName, inputN, inputC, inputH, inputW, inputDT, inputDF, inputData, + "input ", 8); + RunModel(inferenceHandle, resultHandle, inputNum, (const char **)inputName, inputData); + int outputNum, *outputN, *outputC, *outputH, *outputW; DATA_TYPE *outputDT; DATA_FORMAT *outputDF; @@ -46,12 +48,7 @@ int main(int argc, char *argv[]) void **outputData; CreateOutputTensorDesc(resultHandle, &outputNum, &outputName, &outputN, &outputC, &outputH, &outputW, &outputDT, &outputDF); - outputData = malloc(sizeof(void *) * outputNum); - - PrintTensor(inputNum, inputName, inputN, inputC, inputH, inputW, inputDT, inputDF, inputData, - "input ", 8); - RunModel(inferenceHandle, resultHandle, inNum, (const char **)inName, inputData); - + outputData = (void **)malloc(sizeof(void *) * outputNum); GetOutputDataFromResultHandle(resultHandle, outputNum, outputData); PrintTensor(outputNum, outputName, outputN, outputC, outputH, outputW, outputDT, outputDF, outputData, "output ", 8); @@ -59,10 +56,6 @@ int main(int argc, char *argv[]) FreeTensor(inputNum, inputName, inputN, inputC, inputH, inputW, inputDT, inputDF, inputData); FreeTensorDesc(outputNum, outputName, outputN, outputC, outputH, outputW, outputDT, outputDF); free(outputData); - for (int i = 0; i < inNum; i++) { - free(inName[i]); - } - free(inName); FreeResultHandle(resultHandle); DestroyModel(inferenceHandle); return 0; diff --git a/inference/examples/c_api/c_input_method.c b/inference/examples/c_api/c_input_method.c index 9baa2390..1172f23f 100644 --- a/inference/examples/c_api/c_input_method.c +++ b/inference/examples/c_api/c_input_method.c @@ -13,6 +13,7 @@ #include #include +#include "secure_c_wrapper.h" #include "../../examples/c_api/c_test.h" int main(int argc, char *argv[]) @@ -20,15 +21,13 @@ int main(int argc, char *argv[]) ParseOptions(argc, argv); ModelHandle inferenceHandle; ResultHandle resultHandle; - int inNum; - char **inName; if (useFileStream) { char *modelFileStream = BuildFileStream(modelPath); CreateInference(useFileStream, modelFileStream, algorithmMapPath, affinity, - &inferenceHandle, &resultHandle, &inNum, &inName); + &inferenceHandle, &resultHandle); } else { - CreateInference(useFileStream, modelPath, algorithmMapPath, affinity, &inferenceHandle, - &resultHandle, &inNum, &inName); + CreateInference( + useFileStream, modelPath, algorithmMapPath, affinity, &inferenceHandle, &resultHandle); } int inputNum, *inputN, *inputC, *inputH, *inputW; @@ -38,7 +37,7 @@ int main(int argc, char *argv[]) void **inputData; CreateInputTensorDesc(inferenceHandle, &inputNum, &inputName, &inputN, &inputC, &inputH, &inputW, &inputDT, &inputDF); - inputData = 
malloc(sizeof(void *) * inputNum); + inputData = (void **)malloc(sizeof(void *) * inputNum); int outputNum, *outputN, *outputC, *outputH, *outputW; DATA_TYPE *outputDT; @@ -47,7 +46,7 @@ int main(int argc, char *argv[]) void **outputData, **lastOutputData; CreateOutputTensorDesc(resultHandle, &outputNum, &outputName, &outputN, &outputC, &outputH, &outputW, &outputDT, &outputDF); - outputData = malloc(sizeof(void *) * outputNum); + outputData = (void **)malloc(sizeof(void *) * outputNum); MallocTensor(outputNum, outputName, outputN, outputC, outputH, outputW, outputDT, outputDF, &lastOutputData); InitTensor(outputNum, outputName, outputN, outputC, outputH, outputW, outputDT, outputDF, @@ -94,9 +93,9 @@ int main(int argc, char *argv[]) } PrintTensor(inputNum, inputName, inputN, inputC, inputH, inputW, inputDT, inputDF, inputData, " input ", 8); - ResizeModelInput(inferenceHandle, inNum, (const char **)inName, inputN, inputC, inputH, - inputW, inputDT, inputDF); - RunModel(inferenceHandle, resultHandle, inNum, (const char **)inName, inputData); + ResizeModelInput(inferenceHandle, inputNum, (const char **)inputName, inputN, inputC, + inputH, inputW, inputDT, inputDF); + RunModel(inferenceHandle, resultHandle, inputNum, (const char **)inputName, inputData); GetOutputDataFromResultHandle(resultHandle, outputNum, outputData); PrintTensor(outputNum, outputName, outputN, outputC, outputH, outputW, outputDT, outputDF, outputData, " output ", 8); @@ -107,10 +106,6 @@ int main(int argc, char *argv[]) FreeTensor(outputNum, outputName, outputN, outputC, outputH, outputW, outputDT, outputDF, lastOutputData); free(outputData); - for (int i = 0; i < inNum; i++) { - free(inName[i]); - } - free(inName); FreeResultHandle(resultHandle); DestroyModel(inferenceHandle); return 0; diff --git a/inference/examples/c_api/c_test.c b/inference/examples/c_api/c_test.c index 4bea1853..dd09de4d 100644 --- a/inference/examples/c_api/c_test.c +++ b/inference/examples/c_api/c_test.c @@ -13,13 +13,13 @@ #include #include -#include #include +#include #include "../../examples/c_api/c_test.h" char *modelPath = (char *)""; AFFINITY_TYPE affinity = CPU_HIGH_PERFORMANCE; -char *algorithmMapPath = (char *)"./"; +char *algorithmMapPath = NULL; int loopTime = 1; int useFileStream = 0; char *algorithmMapName = (char *)""; diff --git a/inference/examples/image_classification/classification.cpp b/inference/examples/image_classification/classification.cpp index be3746c1..3e0243fa 100644 --- a/inference/examples/image_classification/classification.cpp +++ b/inference/examples/image_classification/classification.cpp @@ -55,7 +55,7 @@ int main(int argc, char *argv[]) int category = -1; int loopTime = 1; if (!parse_res.model.second) { - exit(-1); + return 0; } if (parse_res.model.second) { modelPath = parse_res.model.first; @@ -137,7 +137,7 @@ int main(int argc, char *argv[]) cnn->run(); } #ifdef _USE_GPU - if (strcmp(affinityPolicyName, "GPU") == 0) { + if (std::string(affinityPolicyName) == std::string("GPU")) { gcl_finish(OCLContext::getInstance().handle.get()); } #endif diff --git a/inference/examples/image_matting/u2net.cpp b/inference/examples/image_matting/u2net.cpp index 26a99c90..1c5b21a8 100644 --- a/inference/examples/image_matting/u2net.cpp +++ b/inference/examples/image_matting/u2net.cpp @@ -121,7 +121,7 @@ std::shared_ptr preprocess(cv::Mat image, } } if (appending_channels != 0) { - memcpy(&(vec_transpose[iter_index]), &(vec_flow[0]), vec_flow.size() * sizeof(float)); + UNI_MEMCPY(&(vec_transpose[iter_index]), &(vec_flow[0]), 
vec_flow.size() * sizeof(float)); } return input_ptr; } diff --git a/inference/examples/machine_translation/nmt.cpp b/inference/examples/machine_translation/nmt.cpp index 61ab0872..f02b0b1c 100644 --- a/inference/examples/machine_translation/nmt.cpp +++ b/inference/examples/machine_translation/nmt.cpp @@ -45,7 +45,7 @@ int main(int argc, char *argv[]) if (parse_res.algoPath.second) { algorithmMapPath = parse_res.algoPath.first; } - bool useGPU = (strcmp(affinityPolicyName, "GPU") == 0) ? true : false; + bool useGPU = std::string(affinityPolicyName) == std::string("GPU"); auto pipeline = createPipeline(affinityPolicyName, modelPath, algorithmMapPath); diff --git a/inference/examples/machine_translation/tsc_ssru.cpp b/inference/examples/machine_translation/tsc_ssru.cpp new file mode 100644 index 00000000..17106255 --- /dev/null +++ b/inference/examples/machine_translation/tsc_ssru.cpp @@ -0,0 +1,364 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#ifndef _H_TSC_SSRU_TEST +#define _H_TSC_SSRU_TEST + +#include "inference.hpp" +#include "data_loader.hpp" +#include "profiling.h" +#include "parse_command.h" + +int main(int argc, char *argv[]) +{ + UNI_TIME_INIT + ParseRes parse_res; + parseCommandLine(argc, argv, &parse_res, "examples"); + + char *modelPath = (char *)""; + char *sequenceDirectory = (char *)""; + char *affinityPolicyName = (char *)""; + char *algorithmMapPath = (char *)""; + int loopTime = 1; + + if (!parse_res.model.second) { + exit(-1); + } + if (parse_res.model.second) { + modelPath = parse_res.model.first; + } + if (parse_res.archInfo.second) { + affinityPolicyName = parse_res.archInfo.first; + } + if (parse_res.algoPath.second) { + algorithmMapPath = parse_res.algoPath.first; + } + if (parse_res.loopTime.second) { + loopTime = parse_res.loopTime.first; + } + bool useGPU = std::string(affinityPolicyName) == std::string("GPU"); + std::shared_ptr pipelineBase; + UNI_PROFILE(pipelineBase = createPipeline(affinityPolicyName, modelPath, algorithmMapPath), + std::string("bolt::prepare"), std::string("prepare")); + + U32 batch = 4; + U32 inputLen = 55; + U32 seqLen = batch * inputLen; + U32 shortlistLen = 25100; + U32 input_ids[] = { + 2583, + 16370, + 422, + 175, + 11445, + 38, + 156, + 16718, + 13, + 345, + 1485, + 3677, + 2, + 2905, + 845, + 17379, + 7408, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 2583, + 16370, + 43, + 5, + 2905, + 845, + 17379, + 109, + 16740, + 4, + 3339, + 12550, + 19144, + 55, + 257, + 7, + 156, + 18, + 1961, + 22348, + 1609, + 30, + 4, + 22068, + 12143, + 7, + 18, + 1394, + 609, + 172, + 4, + 1634, + 3, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 5999, + 1567, + 55, + 1588, + 2, + 331, + 15, + 1311, + 16969, + 8, + 6134, + 7, + 15, + 3770, + 7120, + 823, + 5, + 75, + 55, + 679, + 4508, + 2, + 5036, + 6753, + 47, + 16370, + 14288, + 4, + 3540, + 4862, + 6112, + 623, + 156, + 1124, + 82, + 278, + 1981, + 150, + 122, + 18183, + 55, + 13, + 42, + 15, + 33, + 4759, + 569, + 85, + 62, + 6, + 4, + 910, + 3873, + 3, + 0, + 1056, + 345, + 1485, + 3677, + 2, + 122, + 278, + 7088, + 107, + 1089, + 21486, + 9584, + 5, + 8, + 1329, + 11445, + 38, + 156, + 16718, + 13, + 15880, + 10997, + 3, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + }; + F32 *masks = (F32 *)malloc(seqLen * sizeof(F32)); + U32 h_sequence_length[] = {18, 34, 55, 24}; + for (U32 i = 0; i < batch; ++i) { + for (U32 j = 0; j < h_sequence_length[i]; ++j) { + masks[i * inputLen + j] = 1.0f; + } + for (U32 j = h_sequence_length[i]; j < inputLen; ++j) { + masks[i * inputLen + j] = 0.0f; + } + } + + U32 *positions = (U32 *)malloc(seqLen * sizeof(U32)); + for (U32 i = 0; i < seqLen; ++i) { + positions[i] = i % inputLen; + } + + U32 *shortlist = (U32 *)malloc(shortlistLen * sizeof(U32)); + for (U32 i = 0; i < shortlistLen; ++i) { + shortlist[i] = i; + } + + I32 trueRes[] = {2583, 16370, 14386, 14745, 2584, 37, 12, 14143, 2, 72, 3219, 2479, 19, 23, + 3268, 2, 13166, 12, 8506, 7585, 17379, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 2583, 16370, 5, 8506, 61, 7585, 17379, 132, 13166, 2, 12, 24232, 17, 13955, + 813, 523, 468, 1406, 725, 2027, 725, 2027, 725, 27, 12, 6596, 14, 13039, 2, 10, 2471, 5104, + 61, 6584, 1048, 19, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5999, 1567, 55, 1588, 2, 615, 863, 6452, 9, + 9257, 93, 3821, 4579, 5369, 300, 2151, 8386, 2, 1195, 6753, 2, 86, 16370, 13071, 2, 86, 10, + 1075, 5, 153, 6112, 72, 272, 1232, 35, 9869, 1134, 2, 115, 97, 35, 9869, 1134, 2, 115, 97, + 35, 9869, 1134, 2, 44, 33, 12604, 569, 1080, 62, 6, 12, 7185, 23, 171, 3, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 1760, 3219, 2479, 19, 23, 3268, 2, 1134, 97, 3771, 14, 23, 4980, 23969, + 15532, 14, 37, 5903, 9, 14745, 2584, 35, 10, 14143, 9300, 2, 72, 17832, 23, 24330, 3, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + + // load sequences + const char *inputNames[4] = {"encoder_positions", "encoder_words", "nmt_mask", "shortlist"}; + const char *outputNames[1] = {"decoder_output"}; + + std::map inputDescMap; + inputDescMap[inputNames[0]] = tensor2d(DT_U32, batch, inputLen); + inputDescMap[inputNames[1]] = tensor2d(DT_U32, batch, inputLen); + inputDescMap[inputNames[2]] = tensor2d(DT_F32, batch, inputLen); + inputDescMap[inputNames[3]] = tensor2d(DT_U32, 1, shortlistLen); + pipelineBase->reready(inputDescMap); + + std::map> inputs; + inputs[inputNames[0]] = std::shared_ptr((U8 *)positions); + inputs[inputNames[1]] = std::shared_ptr((U8 *)input_ids, [](U8 *) {}); + inputs[inputNames[2]] = std::shared_ptr((U8 *)masks); + inputs[inputNames[3]] = std::shared_ptr((U8 *)shortlist); + + pipelineBase->set_input_by_assign(inputs); + double timeBegin = ut_time_ms(); + for (int i = 0; i < loopTime; ++i) { + pipelineBase->run(); + } + double timeEnd = ut_time_ms(); + double totalTime = (timeEnd - timeBegin); + + Tensor decoder_output = pipelineBase->get_tensor_by_name(outputNames[0]); + U32 outputNum = decoder_output.length(); + for (U32 i = 0; i < outputNum; ++i) { + if (decoder_output.element(i) != trueRes[i]) { + UNI_CI_LOG("ERROR: Get Wrong Result!\n"); + } + } + UNI_CI_LOG("avg_time: %fms/sequence\n", 1.0 * totalTime / loopTime); + return 0; +} + +#endif diff --git a/inference/examples/ultra_face/ultra_face.h b/inference/examples/ultra_face/ultra_face.h index e5111fb9..89a9608b 100644 --- 
a/inference/examples/ultra_face/ultra_face.h +++ b/inference/examples/ultra_face/ultra_face.h @@ -13,7 +13,6 @@ #ifndef _H_ULTRA_FACE #define _H_ULTRA_FACE -#define clip(x, y) (x < 0 ? 0 : (x > y ? y : x)) #include #include #include @@ -61,6 +60,13 @@ std::vector> featuremap_size; std::vector> shrinkage_size; std::vector w_h_list; std::vector> priors = {}; + +inline float clip(float x, float y) +{ + float ret = (x < 0 ? 0 : (x > y ? y : x)); + return ret; +} + inline void prior_boxes_generator( int input_width, int input_length, float score_threshold, float iou_threshold) { @@ -176,7 +182,7 @@ inline void nms(std::vector &input, std::vector &output, int total += exp(buf[i].score); } FaceInfo rects; - memset(&rects, 0, sizeof(rects)); + UNI_MEMSET(&rects, 0, sizeof(rects)); for (unsigned int i = 0; i < buf.size(); i++) { float rate = exp(buf[i].score) / total; rects.x1 += buf[i].x1 * rate; diff --git a/inference/examples/voice_wake_up/slide_tdnn.cpp b/inference/examples/voice_wake_up/slide_tdnn.cpp index db872ab0..cff3971d 100644 --- a/inference/examples/voice_wake_up/slide_tdnn.cpp +++ b/inference/examples/voice_wake_up/slide_tdnn.cpp @@ -159,13 +159,13 @@ int main(int argc, char *argv[]) Tensor buffer = Tensor::alloc_sized(inputDesc); std::shared_ptr dst = ((CpuMemory *)buffer.get_memory())->get_shared_ptr(); model_tensors_input[inputName] = dst; - memset(dst.get(), 0, frameNum * tileSize); + UNI_MEMSET(dst.get(), 0, frameNum * tileSize); // 3: run std::map> outMap; double timeBegin = ut_time_ms(); for (int i = 0; i < frameNum; i++) { - memcpy(dst.get() + (frameNum - i - 1) * tileSize, src.get(), (i + 1) * tileSize); + UNI_MEMCPY(dst.get() + (frameNum - i - 1) * tileSize, src.get(), (i + 1) * tileSize); pipeline->set_input_by_assign(model_tensors_input); pipeline->run(); outMap = get_output(pipeline, affinityPolicyName); diff --git a/inference/flow/src/node.cpp b/inference/flow/src/node.cpp index 7dd5fc20..6b2b64aa 100644 --- a/inference/flow/src/node.cpp +++ b/inference/flow/src/node.cpp @@ -221,7 +221,7 @@ EE Node::run() void *src = ((CpuMemory *)inferenceResult[name]->get_memory())->get_ptr(); void *dst = ((CpuMemory *)postprocessInputs[name]->get_memory())->get_ptr(); if (src != dst) { - memcpy(dst, src, tensorNumBytes(desc)); + UNI_MEMCPY(dst, src, tensorNumBytes(desc)); } } } diff --git a/install.sh b/install.sh index 983f95f8..a3d0fad5 100644 --- a/install.sh +++ b/install.sh @@ -8,6 +8,7 @@ target="" build_threads="8" converter="on" use_serial="on" +use_neon="on" use_fp32="on" use_fp16="on" use_int8="on" @@ -26,7 +27,7 @@ Build bolt library. Mandatory arguments to long options are mandatory for short options too. -h, --help display this help and exit. - --target= target device system and hardware setting, currently only support theses targets: + --target= target device system and hardware setting. xxx_blank will use shell environment variables CC, CXX, CFLAGS and CXXFLAGS, e.g. linux-aarch64_blank is for ARM64 server. currently only support theses targets: EOF print_targets cat < set to use serial calculation(default: ON). + --neon= set to use arm neon calculation(default: ON when using for arm platform). --fp32= set to use float32 calculation(default: ON). --fp16= set to use float16 calculation on arm aarch64(default: ON on aarch64, OFF on others). --int8= set to use int8 calculation on arm aarch64(default: ON on aarch64, OFF on others). 
@@ -49,7 +53,7 @@ EOF } cmake_options="" -TEMP=`getopt -o "ht:c:" -al target:,threads:,help,converter:,example,debug,profile,shared,gpu,openmp,flow,serial:,fp32:,fp16:,int8:,clean -- "$@"` +TEMP=`getopt -o "ht:c:" -al target:,threads:,help,converter:,example,debug,profile,shared,gpu,openmp,flow,serial:,neon:,fp32:,fp16:,int8:,train,clean,secure -- "$@"` if [[ $? != 0 ]]; then echo "[ERROR] ${script_name} terminating..." >&2 exit 1 @@ -99,6 +103,9 @@ while true ; do --serial) use_serial=$2 shift 2 ;; + --neon) + use_neon=$2 + shift 2 ;; --fp32) use_fp32=$2 shift 2 ;; @@ -108,6 +115,12 @@ while true ; do --int8) use_int8=$2 shift 2 ;; + --train) + cmake_options="${cmake_options} -DUSE_TRAINING=ON -DRAUL_CONFIG_BLAS_VENDOR=Huawei" + shift ;; + --secure) + cmake_options="${cmake_options} -DUSE_SECURE_C=ON" + shift ;; --clean) clean="on" shift ;; @@ -127,7 +140,7 @@ target=$(map_target ${target}) check_target ${target} if [[ "${converter}" == "ON" || "${converter}" == "on" ]]; then - cmake_options="${cmake_options} -DUSE_CAFFE=ON -DUSE_ONNX=ON -DUSE_TFLITE=ON -DUSE_TENSORFLOW=ON" + cmake_options="${cmake_options} -DUSE_CAFFE=ON -DUSE_ONNX=ON -DUSE_TFLITE=ON -DUSE_TENSORFLOW=ON -DUSE_MINDSPORE=ON" fi source ${script_dir}/scripts/setup_compiler.sh || exit 1 @@ -167,7 +180,9 @@ else cmake_options="${cmake_options} -DUSE_FP32=OFF" fi if [[ ${target} =~ aarch64 ]]; then - cmake_options="${cmake_options} -DUSE_NEON=ON" + if [[ "${use_neon}" == "ON" || "${use_neon}" == "on" ]]; then + cmake_options="${cmake_options} -DUSE_NEON=ON" + fi if [[ ${cmake_options} =~ USE_GPU=ON ]]; then use_fp16="on" fi @@ -183,6 +198,10 @@ if [[ ${target} =~ aarch64 ]]; then fi fi rm -rf test.log main + else + if [[ "${use_int8}" == "ON" || "${use_int8}" == "on" ]]; then + cmake_options="${cmake_options} -DUSE_INT8=ON" + fi fi elif [[ ${target} =~ avx ]]; then cmake_options="${cmake_options} -DUSE_X86=ON" @@ -200,7 +219,9 @@ else fi fi if [[ "${target}" == "linux-arm_himix100" || ${target} =~ armv7 || "${target}" == "linux-arm_musleabi" ]]; then - cmake_options="${cmake_options} -DUSE_NEON=ON" + if [[ "${use_neon}" == "ON" || "${use_neon}" == "on" ]]; then + cmake_options="${cmake_options} -DUSE_NEON=ON" + fi if [[ "${use_int8}" == "ON" || "${use_int8}" == "on" ]]; then cmake_options="${cmake_options} -DUSE_INT8=ON" fi @@ -238,6 +259,6 @@ if [[ ${cmake_options} =~ USE_FLOW=ON ]]; then fi ${BOLT_ROOT}/kit/setup.sh ${platform} ${kit_flow} || exit 1 -${MAKE} test ARGS="-V" +${MAKE} test ARGS="-V" || exit 1 cd .. 
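Editor's note on the install.sh changes above: the new --neon, --train and --secure switches map onto the CMake options wired in by this patch (USE_NEON, USE_TRAINING with RAUL_CONFIG_BLAS_VENDOR=Huawei, and USE_SECURE_C). A usage sketch follows; the target name is illustrative (taken from the android-aarch64 paths used elsewhere in this patch) and the exact set of supported targets is resolved by the setup scripts, not shown here:

    # default build for an ARM target: NEON kernels stay on
    ./install.sh --target=android-aarch64
    # explicitly disable NEON kernels (new --neon switch, default on for ARM)
    ./install.sh --target=android-aarch64 --neon=off
    # enable on-device training (adds -DUSE_TRAINING=ON -DRAUL_CONFIG_BLAS_VENDOR=Huawei)
    ./install.sh --target=android-aarch64 --train
    # build against the secure C wrappers (adds -DUSE_SECURE_C=ON)
    ./install.sh --target=android-aarch64 --secure

Note also that `${MAKE} test ARGS="-V" || exit 1` now propagates test failures to the caller instead of silently continuing.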
diff --git a/kit/Android/ChineseSpeechRecognition/app/src/main/assets/encoder_flow.prototxt b/kit/Android/ChineseSpeechRecognition/app/src/main/assets/encoder_flow.prototxt new file mode 100644 index 00000000..a5cd74d6 --- /dev/null +++ b/kit/Android/ChineseSpeechRecognition/app/src/main/assets/encoder_flow.prototxt @@ -0,0 +1,350 @@ +name: "encoder" +input: "sounds" +input: "encoder_block0_trunk0_layer0_mem" +input: "encoder_block0_trunk0_layer1_mem" +input: "encoder_block1_trunk1_layer0_kmem" +input: "encoder_block1_trunk1_layer0_vmem" +input: "encoder_block1_trunk1_layer1_kmem" +input: "encoder_block1_trunk1_layer1_vmem" +input: "encoder_block2_trunk0_layer0_mem" +input: "encoder_block2_trunk0_layer1_mem" +input: "encoder_block2_trunk1_layer0_kmem" +input: "encoder_block2_trunk1_layer0_vmem" +input: "encoder_block2_trunk1_layer1_kmem" +input: "encoder_block2_trunk1_layer1_vmem" +input: "encoder_block3_trunk0_layer0_mem" +input: "encoder_block3_trunk0_layer1_mem" +input: "encoder_block3_trunk1_layer0_kmem" +input: "encoder_block3_trunk1_layer0_vmem" +input: "encoder_block3_trunk1_layer1_kmem" +input: "encoder_block3_trunk1_layer1_vmem" +input: "encoder_block3_trunk1_layer2_kmem" +input: "encoder_block3_trunk1_layer2_vmem" +input: "encoder_block3_trunk1_layer3_kmem" +input: "encoder_block3_trunk1_layer3_vmem" +output: "encoder_block3_transformer_ln" +output: "encoder_block0_conv0_neg_slice" +output: "encoder_block0_conv1_neg_slice" +output: "encoder_block1_transformer_layer0_k_neg_slice" +output: "encoder_block1_transformer_layer0_v_neg_slice" +output: "encoder_block1_transformer_layer1_k_neg_slice" +output: "encoder_block1_transformer_layer1_v_neg_slice" +output: "encoder_block2_conv0_neg_slice" +output: "encoder_block2_conv1_neg_slice" +output: "encoder_block2_transformer_layer0_k_neg_slice" +output: "encoder_block2_transformer_layer0_v_neg_slice" +output: "encoder_block2_transformer_layer1_k_neg_slice" +output: "encoder_block2_transformer_layer1_v_neg_slice" +output: "encoder_block3_conv0_neg_slice" +output: "encoder_block3_conv1_neg_slice" +output: "encoder_block3_transformer_layer0_k_neg_slice" +output: "encoder_block3_transformer_layer0_v_neg_slice" +output: "encoder_block3_transformer_layer1_k_neg_slice" +output: "encoder_block3_transformer_layer1_v_neg_slice" +output: "encoder_block3_transformer_layer2_k_neg_slice" +output: "encoder_block3_transformer_layer2_v_neg_slice" +output: "encoder_block3_transformer_layer3_k_neg_slice" +output: "encoder_block3_transformer_layer3_v_neg_slice" +node { + name: "sounds" + type: "Input" + output: "sounds" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 15 + input_dim: 128 +} +node { + name: "encoder_block0_trunk0_layer0_mem" + type: "Input" + output: "encoder_block0_trunk0_layer0_mem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 2 + input_dim: 128 + input_dim: 1 +} +node { + name: "encoder_block0_trunk0_layer1_mem" + type: "Input" + output: "encoder_block0_trunk0_layer1_mem" + input_type: "FLOAT32" + input_format: "NCHWC8" + input_dim: 1 + input_dim: 32 + input_dim: 1 + input_dim: 64 +} +node { + name: "encoder_block1_trunk1_layer0_kmem" + type: "Input" + output: "encoder_block1_trunk1_layer0_kmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 5 + input_dim: 6 + input_dim: 64 +} +node { + name: "encoder_block1_trunk1_layer0_vmem" + type: "Input" + output: "encoder_block1_trunk1_layer0_vmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + 
input_dim: 5 + input_dim: 6 + input_dim: 64 +} +node { + name: "encoder_block1_trunk1_layer1_kmem" + type: "Input" + output: "encoder_block1_trunk1_layer1_kmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 7 + input_dim: 6 + input_dim: 64 +} +node { + name: "encoder_block1_trunk1_layer1_vmem" + type: "Input" + output: "encoder_block1_trunk1_layer1_vmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 7 + input_dim: 6 + input_dim: 64 +} +node { + name: "encoder_block2_trunk0_layer0_mem" + type: "Input" + output: "encoder_block2_trunk0_layer0_mem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 2 + input_dim: 384 +} +node { + name: "encoder_block2_trunk0_layer1_mem" + type: "Input" + output: "encoder_block2_trunk0_layer1_mem" + input_type: "FLOAT32" + input_format: "NCHWC8" + input_dim: 1 + input_dim: 1024 + input_dim: 1 + input_dim: 1 +} +node { + name: "encoder_block2_trunk1_layer0_kmem" + type: "Input" + output: "encoder_block2_trunk1_layer0_kmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 7 + input_dim: 8 + input_dim: 64 +} +node { + name: "encoder_block2_trunk1_layer0_vmem" + type: "Input" + output: "encoder_block2_trunk1_layer0_vmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 7 + input_dim: 8 + input_dim: 64 +} +node { + name: "encoder_block2_trunk1_layer1_kmem" + type: "Input" + output: "encoder_block2_trunk1_layer1_kmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 9 + input_dim: 8 + input_dim: 64 +} +node { + name: "encoder_block2_trunk1_layer1_vmem" + type: "Input" + output: "encoder_block2_trunk1_layer1_vmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 9 + input_dim: 8 + input_dim: 64 +} +node { + name: "encoder_block3_trunk0_layer0_mem" + type: "Input" + output: "encoder_block3_trunk0_layer0_mem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 2 + input_dim: 512 +} +node { + name: "encoder_block3_trunk0_layer1_mem" + type: "Input" + output: "encoder_block3_trunk0_layer1_mem" + input_type: "FLOAT32" + input_format: "NCHWC8" + input_dim: 1 + input_dim: 1024 + input_dim: 1 + input_dim: 1 +} +node { + name: "encoder_block3_trunk1_layer0_kmem" + type: "Input" + output: "encoder_block3_trunk1_layer0_kmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 9 + input_dim: 8 + input_dim: 64 +} +node { + name: "encoder_block3_trunk1_layer0_vmem" + type: "Input" + output: "encoder_block3_trunk1_layer0_vmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 9 + input_dim: 8 + input_dim: 64 +} +node { + name: "encoder_block3_trunk1_layer1_kmem" + type: "Input" + output: "encoder_block3_trunk1_layer1_kmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 15 + input_dim: 8 + input_dim: 64 +} +node { + name: "encoder_block3_trunk1_layer1_vmem" + type: "Input" + output: "encoder_block3_trunk1_layer1_vmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 15 + input_dim: 8 + input_dim: 64 +} +node { + name: "encoder_block3_trunk1_layer2_kmem" + type: "Input" + output: "encoder_block3_trunk1_layer2_kmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 23 + input_dim: 8 + input_dim: 64 +} +node { + name: "encoder_block3_trunk1_layer2_vmem" + type: "Input" + output: "encoder_block3_trunk1_layer2_vmem" + input_type: "FLOAT32" + input_format: 
"NCHW" + input_dim: 1 + input_dim: 23 + input_dim: 8 + input_dim: 64 +} +node { + name: "encoder_block3_trunk1_layer3_kmem" + type: "Input" + output: "encoder_block3_trunk1_layer3_kmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 31 + input_dim: 8 + input_dim: 64 +} +node { + name: "encoder_block3_trunk1_layer3_vmem" + type: "Input" + output: "encoder_block3_trunk1_layer3_vmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 31 + input_dim: 8 + input_dim: 64 +} +node { + name: "encoder_inference" + type: "Inference" + input: "sounds" + input: "encoder_block0_trunk0_layer0_mem" + input: "encoder_block0_trunk0_layer1_mem" + input: "encoder_block1_trunk1_layer0_kmem" + input: "encoder_block1_trunk1_layer0_vmem" + input: "encoder_block1_trunk1_layer1_kmem" + input: "encoder_block1_trunk1_layer1_vmem" + input: "encoder_block2_trunk0_layer0_mem" + input: "encoder_block2_trunk0_layer1_mem" + input: "encoder_block2_trunk1_layer0_kmem" + input: "encoder_block2_trunk1_layer0_vmem" + input: "encoder_block2_trunk1_layer1_kmem" + input: "encoder_block2_trunk1_layer1_vmem" + input: "encoder_block3_trunk0_layer0_mem" + input: "encoder_block3_trunk0_layer1_mem" + input: "encoder_block3_trunk1_layer0_kmem" + input: "encoder_block3_trunk1_layer0_vmem" + input: "encoder_block3_trunk1_layer1_kmem" + input: "encoder_block3_trunk1_layer1_vmem" + input: "encoder_block3_trunk1_layer2_kmem" + input: "encoder_block3_trunk1_layer2_vmem" + input: "encoder_block3_trunk1_layer3_kmem" + input: "encoder_block3_trunk1_layer3_vmem" + output: "encoder_block3_transformer_ln" + output: "encoder_block0_conv0_neg_slice" + output: "encoder_block0_conv1_neg_slice" + output: "encoder_block1_transformer_layer0_k_neg_slice" + output: "encoder_block1_transformer_layer0_v_neg_slice" + output: "encoder_block1_transformer_layer1_k_neg_slice" + output: "encoder_block1_transformer_layer1_v_neg_slice" + output: "encoder_block2_conv0_neg_slice" + output: "encoder_block2_conv1_neg_slice" + output: "encoder_block2_transformer_layer0_k_neg_slice" + output: "encoder_block2_transformer_layer0_v_neg_slice" + output: "encoder_block2_transformer_layer1_k_neg_slice" + output: "encoder_block2_transformer_layer1_v_neg_slice" + output: "encoder_block3_conv0_neg_slice" + output: "encoder_block3_conv1_neg_slice" + output: "encoder_block3_transformer_layer0_k_neg_slice" + output: "encoder_block3_transformer_layer0_v_neg_slice" + output: "encoder_block3_transformer_layer1_k_neg_slice" + output: "encoder_block3_transformer_layer1_v_neg_slice" + output: "encoder_block3_transformer_layer2_k_neg_slice" + output: "encoder_block3_transformer_layer2_v_neg_slice" + output: "encoder_block3_transformer_layer3_k_neg_slice" + output: "encoder_block3_transformer_layer3_v_neg_slice" + infer_output_size_parameter: "encoderInferOutputSize" + preprocess_parameter: "encoderPreProcess" + inference_parameter: "/data/user/0/com.huawei.noah/cache/asr_convolution_transformer_encoder_f32.bolt" +} diff --git a/kit/Android/ChineseSpeechRecognition/app/src/main/assets/joint_flow.prototxt b/kit/Android/ChineseSpeechRecognition/app/src/main/assets/joint_flow.prototxt new file mode 100644 index 00000000..d8ceb477 --- /dev/null +++ b/kit/Android/ChineseSpeechRecognition/app/src/main/assets/joint_flow.prototxt @@ -0,0 +1,33 @@ +name: "joint_flow" +input: "encoder" +input: "prediction_net" +output: "output_argmax" +node { + name: "encoder" + type: "Input" + output: "encoder" + input_type: "FLOAT32" + input_format: "MTK" + 
input_dim: 1 + input_dim: 1 + input_dim: 512 +} +node { + name: "prediction_net" + type: "Input" + output: "prediction_net" + input_type: "FLOAT32" + input_format: "MTK" + input_dim: 1 + input_dim: 1 + input_dim: 512 +} +node { + name: "joint_inference" + type: "Inference" + input: "encoder" + input: "prediction_net" + output: "output_argmax" + infer_output_size_parameter: "jointInferOutputSize" + inference_parameter: "/data/user/0/com.huawei.noah/cache/asr_convolution_transformer_joint_net_f32.bolt" +} diff --git a/kit/Android/ChineseSpeechRecognition/app/src/main/assets/pinyin2hanzi_flow.prototxt b/kit/Android/ChineseSpeechRecognition/app/src/main/assets/pinyin2hanzi_flow.prototxt new file mode 100644 index 00000000..a493aa81 --- /dev/null +++ b/kit/Android/ChineseSpeechRecognition/app/src/main/assets/pinyin2hanzi_flow.prototxt @@ -0,0 +1,24 @@ +name: "pinyin2hanzi_flow" +input: "pinyin" +output: "hanzi_squeeze/Squeeze" +node { + name: "pinyin" + type: "Input" + output: "pinyin" + input_type: "UINT32" + input_format: "NORMAL" + input_dim: 1 + input_dim: 32 +} +node { + name: "pinyin2hanzi_inference" + type: "Inference" + input: "pinyin" + output: "hanzi_squeeze/Squeeze" + infer_output_size_parameter: "pinyin2hanziInferOutputSize" + preprocess_parameter: "pinyin2hanziPreProcess" + preprocess_parameter: "/data/user/0/com.huawei.noah/cache/pinyin_lm_embedding.bin" + preprocess_parameter: "1601" + preprocess_parameter: "512" + inference_parameter: "/data/user/0/com.huawei.noah/cache/cnn_pinyin_lm_b7h512e4_cn_en_20200518_cloud_fp32_f32.bolt" +} diff --git a/kit/Android/ChineseSpeechRecognition/app/src/main/assets/prediction_flow.prototxt b/kit/Android/ChineseSpeechRecognition/app/src/main/assets/prediction_flow.prototxt new file mode 100644 index 00000000..c5707a29 --- /dev/null +++ b/kit/Android/ChineseSpeechRecognition/app/src/main/assets/prediction_flow.prototxt @@ -0,0 +1,139 @@ +name: "prediction" +input: "label" +input: "prediction_net_layer0_kmem" +input: "prediction_net_layer0_vmem" +input: "prediction_net_layer1_kmem" +input: "prediction_net_layer1_vmem" +input: "prediction_net_layer2_kmem" +input: "prediction_net_layer2_vmem" +input: "prediction_net_layer3_kmem" +input: "prediction_net_layer3_vmem" +output: "prediction_net_ln" +output: "prediction_net_layer0_k_neg_slice" +output: "prediction_net_layer0_v_neg_slice" +output: "prediction_net_layer1_k_neg_slice" +output: "prediction_net_layer1_v_neg_slice" +output: "prediction_net_layer2_k_neg_slice" +output: "prediction_net_layer2_v_neg_slice" +output: "prediction_net_layer3_k_neg_slice" +output: "prediction_net_layer3_v_neg_slice" +node { + name: "label" + type: "Input" + output: "label" + input_type: "UINT32" + input_format: "NORMAL" + input_dim: 1 + input_dim: 1 +} +node { + name: "prediction_net_layer0_kmem" + type: "Input" + output: "prediction_net_layer0_kmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 3 + input_dim: 8 + input_dim: 64 +} +node { + name: "prediction_net_layer0_vmem" + type: "Input" + output: "prediction_net_layer0_vmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 3 + input_dim: 8 + input_dim: 64 +} +node { + name: "prediction_net_layer1_kmem" + type: "Input" + output: "prediction_net_layer1_kmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 5 + input_dim: 8 + input_dim: 64 +} +node { + name: "prediction_net_layer1_vmem" + type: "Input" + output: "prediction_net_layer1_vmem" + input_type: "FLOAT32" + input_format: "NCHW" + 
input_dim: 1 + input_dim: 5 + input_dim: 8 + input_dim: 64 +} +node { + name: "prediction_net_layer2_kmem" + type: "Input" + output: "prediction_net_layer2_kmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 7 + input_dim: 8 + input_dim: 64 +} +node { + name: "prediction_net_layer2_vmem" + type: "Input" + output: "prediction_net_layer2_vmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 7 + input_dim: 8 + input_dim: 64 +} +node { + name: "prediction_net_layer3_kmem" + type: "Input" + output: "prediction_net_layer3_kmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 9 + input_dim: 8 + input_dim: 64 +} +node { + name: "prediction_net_layer3_vmem" + type: "Input" + output: "prediction_net_layer3_vmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 9 + input_dim: 8 + input_dim: 64 +} +node { + name: "prediction_inference" + input: "label" + input: "prediction_net_layer0_kmem" + input: "prediction_net_layer0_vmem" + input: "prediction_net_layer1_kmem" + input: "prediction_net_layer1_vmem" + input: "prediction_net_layer2_kmem" + input: "prediction_net_layer2_vmem" + input: "prediction_net_layer3_kmem" + input: "prediction_net_layer3_vmem" + output: "prediction_net_ln" + output: "prediction_net_layer0_k_neg_slice" + output: "prediction_net_layer0_v_neg_slice" + output: "prediction_net_layer1_k_neg_slice" + output: "prediction_net_layer1_v_neg_slice" + output: "prediction_net_layer2_k_neg_slice" + output: "prediction_net_layer2_v_neg_slice" + output: "prediction_net_layer3_k_neg_slice" + output: "prediction_net_layer3_v_neg_slice" + infer_output_size_parameter: "predictionInferOutputSize" + inference_parameter: "/data/user/0/com.huawei.noah/cache/asr_convolution_transformer_prediction_net_f32.bolt" +} diff --git a/kit/Android/ReadingComprehension/.gitignore b/kit/Android/ReadingComprehension/.gitignore new file mode 100644 index 00000000..aa724b77 --- /dev/null +++ b/kit/Android/ReadingComprehension/.gitignore @@ -0,0 +1,15 @@ +*.iml +.gradle +/local.properties +/.idea/caches +/.idea/libraries +/.idea/modules.xml +/.idea/workspace.xml +/.idea/navEditor.xml +/.idea/assetWizardSettings.xml +.DS_Store +/build +/captures +.externalNativeBuild +.cxx +local.properties diff --git a/kit/Android/ReadingComprehension/app/.gitignore b/kit/Android/ReadingComprehension/app/.gitignore new file mode 100644 index 00000000..42afabfd --- /dev/null +++ b/kit/Android/ReadingComprehension/app/.gitignore @@ -0,0 +1 @@ +/build \ No newline at end of file diff --git a/kit/Android/ReadingComprehension/app/build.gradle b/kit/Android/ReadingComprehension/app/build.gradle new file mode 100644 index 00000000..2e3ece18 --- /dev/null +++ b/kit/Android/ReadingComprehension/app/build.gradle @@ -0,0 +1,48 @@ +plugins { + id 'com.android.application' +} + +android { + compileSdkVersion 32 + buildToolsVersion "30.0.3" + + defaultConfig { + applicationId "com.huawei.noah" + minSdkVersion 16 + targetSdkVersion 32 + versionCode 1 + versionName "1.0" + + testInstrumentationRunner "androidx.test.runner.AndroidJUnitRunner" + + ndk{ + abiFilters "arm64-v8a" + } + } + + buildTypes { + release { + minifyEnabled false + proguardFiles getDefaultProguardFile('proguard-android-optimize.txt'), 'proguard-rules.pro' + } + } + compileOptions { + sourceCompatibility JavaVersion.VERSION_1_8 + targetCompatibility JavaVersion.VERSION_1_8 + } + + + buildFeatures { + viewBinding true + } +} + +dependencies { + + implementation 
'androidx.appcompat:appcompat:1.4.0' + implementation 'com.google.android.material:material:1.4.0' + implementation 'androidx.constraintlayout:constraintlayout:2.1.2' + testImplementation 'junit:junit:4.+' + androidTestImplementation 'androidx.test.ext:junit:1.1.3' + androidTestImplementation 'androidx.test.espresso:espresso-core:3.3.0' +} \ No newline at end of file diff --git a/kit/Android/ReadingComprehension/app/proguard-rules.pro b/kit/Android/ReadingComprehension/app/proguard-rules.pro new file mode 100644 index 00000000..481bb434 --- /dev/null +++ b/kit/Android/ReadingComprehension/app/proguard-rules.pro @@ -0,0 +1,21 @@ +# Add project specific ProGuard rules here. +# You can control the set of applied configuration files using the +# proguardFiles setting in build.gradle. +# +# For more details, see +# http://developer.android.com/guide/developing/tools/proguard.html + +# If your project uses WebView with JS, uncomment the following +# and specify the fully qualified class name to the JavaScript interface +# class: +#-keepclassmembers class fqcn.of.javascript.interface.for.webview { +# public *; +#} + +# Uncomment this to preserve the line number information for +# debugging stack traces. +#-keepattributes SourceFile,LineNumberTable + +# If you keep the line number information, uncomment this to +# hide the original source file name. +#-renamesourcefileattribute SourceFile \ No newline at end of file diff --git a/kit/Android/ReadingComprehension/app/src/main/AndroidManifest.xml b/kit/Android/ReadingComprehension/app/src/main/AndroidManifest.xml new file mode 100644 index 00000000..57d457d5 --- /dev/null +++ b/kit/Android/ReadingComprehension/app/src/main/AndroidManifest.xml @@ -0,0 +1,23 @@ + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/kit/Android/ReadingComprehension/app/src/main/java/com/huawei/noah/MainActivity.java b/kit/Android/ReadingComprehension/app/src/main/java/com/huawei/noah/MainActivity.java new file mode 100644 index 00000000..f207e6e0 --- /dev/null +++ b/kit/Android/ReadingComprehension/app/src/main/java/com/huawei/noah/MainActivity.java @@ -0,0 +1,250 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+package com.huawei.noah; + +import androidx.appcompat.app.AppCompatActivity; + +import android.content.Intent; +import android.os.Bundle; +import android.os.Handler; +import android.os.Looper; +import android.util.Log; +import android.view.Display; +import android.view.View; +import android.widget.EditText; +import android.widget.ProgressBar; +import android.widget.TextView; +import android.widget.Toast; + +import com.huawei.noah.bert.AppTokenizer; +import com.huawei.noah.bert.PredictionModel; +import com.huawei.noah.databinding.ActivityMainBinding; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; +import java.util.TreeMap; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; + +public class MainActivity extends AppCompatActivity implements View.OnClickListener { + private ActivityMainBinding binding; + private EditText content; + private EditText question; + private TextView answer; + private static final String VOCAB = "vocab.txt"; + private static final String MODEL = "bert_squad_10_f32.bolt"; + private String modelPath; + private AppTokenizer appTokenizer; + private ProgressBar progressBar; + + private ExecutorService executorService; + @Override protected void onCreate(Bundle savedInstanceState) + { + super.onCreate(savedInstanceState); + + binding = ActivityMainBinding.inflate(getLayoutInflater()); + setContentView(binding.getRoot()); + + executorService = Executors.newFixedThreadPool(1); + content = binding.content; + question = binding.question; + answer = binding.answer; + progressBar = binding.progress; + + findViewById(R.id.demo1).setOnClickListener(this); + findViewById(R.id.demo2).setOnClickListener(this); + findViewById(R.id.ask_button).setOnClickListener(this); + + String vocab = getCacheDir() + File.separator + VOCAB; + modelPath = getCacheDir() + File.separator + MODEL; + + try { + copyAssetResource2File(VOCAB, vocab); + copyAssetResource2File(MODEL, modelPath); + } catch (IOException e) { + e.printStackTrace(); + } + + appTokenizer = new AppTokenizer(vocab); + } + + private void copyAssetResource2File(String assetsFile, String outFile) throws IOException + { + File outF = new File(outFile); + if (outF.exists()) + return; + InputStream is = this.getAssets().open(assetsFile); + FileOutputStream fos = new FileOutputStream(outF); + int byteCount; + byte[] buffer = new byte[1024]; + while ((byteCount = is.read(buffer)) != -1) { + fos.write(buffer, 0, byteCount); + } + fos.flush(); + is.close(); + fos.close(); + outF.setReadable(true); + } + + @Override public void onClick(View v) + { + switch (v.getId()) { + case R.id.ask_button: { + if (content.getText().toString().length() == 0) { + Toast + .makeText( + getApplicationContext(), "Content can not be null", Toast.LENGTH_LONG) + .show(); + return; + } else if (question.getText().toString().length() == 0) { + Toast + .makeText( + getApplicationContext(), "Question can not be null", Toast.LENGTH_LONG) + .show(); + return; + } + + progressBar.setVisibility(View.VISIBLE); + executorService.submit(new Runnable() { + @Override public void run() + { + float[][] tokenizers = appTokenizer.runTokenizer( + content.getText().toString(), question.getText().toString()); + int[] inputCActual = { + tokenizers[0].length, tokenizers[1].length, tokenizers[2].length}; + int inputNum = 
3; + int outputNum = 2; + String[] inputName = {"input_ids:0", "input_mask:0", "segment_ids:0"}; + String[] outputName = {"unstack:0", "unstack:1"}; + int[] inputN = {1, 1, 1}; + int[] inputCMax = {256, 256, 256}; + int[] inputH = {1, 1, 1}; + int[] inputW = {1, 1, 1}; + DataType[] inputDatatype = {DataType.INT32, DataType.INT32, DataType.INT32}; + DataFormat[] inputDataFormat = { + DataFormat.NORMAL, DataFormat.NORMAL, DataFormat.NORMAL}; + BoltModel boltModel = new BoltModel(modelPath, + AffinityType.CPU_HIGH_PERFORMANCE, inputNum, inputName, inputN, + inputCMax, inputH, inputW, inputDatatype, inputDataFormat, outputNum, + outputName); + BoltResult boltResult = boltModel.run(inputNum, inputName, inputN, + inputCActual, inputH, inputW, inputDatatype, inputDataFormat, + tokenizers); + float[][] result = boltResult.getResultData(); + String resultStr = getResultAnswer(result); + boltModel.destructor(); + doOnUiCode(resultStr); + } + }); + + } break; + case R.id.demo1: { + content.setText(getString(R.string.Demo1)); + question.setText(getString(R.string.Ques1)); + answer.setText(""); + } break; + case R.id.demo2: { + content.setText(getString(R.string.Demo2)); + question.setText(getString(R.string.Ques2)); + answer.setText(""); + } break; + + default: + break; + } + } + + private void doOnUiCode(String string) + { + Handler uiThread = new Handler(Looper.getMainLooper()); + uiThread.post(new Runnable() { + @Override public void run() + { + answer.setText(string); + progressBar.setVisibility(View.GONE); + } + }); + } + + private String getResultAnswer(float[][] result) + { + ArrayList start_index = getBestIndexs(result[0], 20); + ArrayList end_index = getBestIndexs(result[1], 20); + ArrayList predictionModels = new ArrayList<>(); + for (int start : start_index) { + for (int end : end_index) { + predictionModels.add( + new PredictionModel(start, end, result[0][start], result[1][end])); + } + } + Collections.sort(predictionModels, new Comparator() { + @Override public int compare(PredictionModel o1, PredictionModel o2) + { + if ((o1.start_logit + o1.end_logit) >= (o2.start_logit + o2.end_logit)) { + return -1; + } else + return 1; + } + }); + + PredictionModel predictionModel = predictionModels.get(2); + String tok = ""; + for (int i = predictionModel.start; i <= predictionModel.end; i++) { + if (appTokenizer.features_.get(i).contains("##")) { + String s = appTokenizer.features_.get(i).substring( + appTokenizer.features_.get(i).indexOf("##") + 2); + tok += s; + } else { + if (i == predictionModel.start) { + tok += appTokenizer.features_.get(i); + } else { + tok += " " + appTokenizer.features_.get(i); + } + } + } + return tok; + } + + private ArrayList getBestIndexs(float[] datas, int bestSize) + { + ArrayList results = new ArrayList<>(); + Map unstack = new TreeMap(new Comparator() { + @Override public int compare(Float o1, Float o2) + { + return o2.compareTo(o1); + } + }); + + for (int i = 0; i < 256; i++) { + unstack.put(datas[i], i); + } + + int index = 0; + for (Iterator i = unstack.values().iterator(); i.hasNext();) { + if (index >= bestSize) + break; + Object obj = i.next(); + results.add((int)obj); + index++; + } + return results; + } +} diff --git a/kit/Android/ReadingComprehension/app/src/main/java/com/huawei/noah/bert/AppTokenizer.java b/kit/Android/ReadingComprehension/app/src/main/java/com/huawei/noah/bert/AppTokenizer.java new file mode 100644 index 00000000..d42df66d --- /dev/null +++ b/kit/Android/ReadingComprehension/app/src/main/java/com/huawei/noah/bert/AppTokenizer.java @@ 
-0,0 +1,119 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +package com.huawei.noah.bert; + +import android.util.Log; + +import java.util.ArrayList; +import java.util.List; + +public class AppTokenizer { + private static final String TAG = "AppTokenizer"; + + private int maxSeqLength; + private int tokenSize; + private List tokens; + private FullTokenizer tokenizer; + public List features_ = new ArrayList<>(); + + public AppTokenizer(String vocab) + { + this.maxSeqLength = 256; + tokenizer = new FullTokenizer(vocab); + } + + public float[][] runTokenizer(String paragraph, String question) + { + List paragraph_tokens = tokenizer.tokenize(paragraph); + List feture1 = tokenizer.getFeaturesList(); + List question_tokens = tokenizer.tokenize(question); + List feture2 = tokenizer.getFeaturesList(); + + return getExampleSingle(tokenizer, paragraph_tokens, question_tokens, feture1, feture2); + } + + private float[][] getExampleSingle(FullTokenizer tokenizer, + List paragraph_tokens, + List question_tokens, + List feature1, + List feature2) + { + tokens = new ArrayList<>(); + List segmentIds = new ArrayList<>(); + List positions = new ArrayList(); + + features_.clear(); + features_.add("[CLS]"); + tokens.add("[CLS]"); + + for (int i = 0; i < question_tokens.size(); i++) { + tokens.add(question_tokens.get(i)); + features_.add(feature2.get(i)); + } + tokens.add("[SEP]"); + features_.add("[SEP]"); + + for (int i = 0; i < paragraph_tokens.size(); i++) { + tokens.add(paragraph_tokens.get(i)); + features_.add(feature1.get(i)); + } + tokens.add("[SEP]"); + features_.add("[SEP]"); + + List inputIds = tokenizer.convertTokensToIds(tokens); + for (int i = 0; i < maxSeqLength; i++) { + if (i < inputIds.size()) { + if (i < question_tokens.size() + 2) { + segmentIds.add(0); + } else { + segmentIds.add(1); + } + positions.add(1); + } else { + inputIds.add(0); + segmentIds.add(0); + positions.add(0); + } + } + + float[][] outputs = new float[3][maxSeqLength]; + for (int i = 0; i < inputIds.size(); i++) { + outputs[0][i] = inputIds.get(i); + } + for (int i = 0; i < positions.size(); i++) { + outputs[1][i] = positions.get(i); + } + + for (int i = 0; i < segmentIds.size(); i++) { + outputs[2][i] = segmentIds.get(i); + } + + tokenSize = paragraph_tokens.size() + 2 + question_tokens.size() + 1; + Log.i(TAG, "getExampleSingle: tokenSize = " + tokenSize); + return outputs; + } + + public String getTokens() + { + StringBuilder stringBuilder = new 
StringBuilder(); + for (int i = 1; i < tokens.size() - 1; i++) { + stringBuilder.append(tokens.get(i)).append(" "); + } + return stringBuilder.toString(); + } + + public int getTokenSize() + { + return tokenSize; + } +} diff --git a/kit/Android/ReadingComprehension/app/src/main/java/com/huawei/noah/bert/BasicTokenizer.java b/kit/Android/ReadingComprehension/app/src/main/java/com/huawei/noah/bert/BasicTokenizer.java new file mode 100644 index 00000000..0abf15ba --- /dev/null +++ b/kit/Android/ReadingComprehension/app/src/main/java/com/huawei/noah/bert/BasicTokenizer.java @@ -0,0 +1,162 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +package com.huawei.noah.bert; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class BasicTokenizer { + public List tokenize(String text) + { + String cleanText = cleanText(text); + + String chineseTokens = tokenizeChineseChars(cleanText); + + List origTokens = whiteSpaceTokenize(chineseTokens); + + String str = ""; + for (String token : origTokens) { + List list = runSplitOnPunc(token); + for (int i = 0; i < list.size(); i++) { + str += list.get(i) + " "; + } + } + + List resTokens = whiteSpaceTokenize(str); + + return resTokens; + } + + private List runSplitOnPunc(String token) + { + List> result = new ArrayList>(); + + int length = token.length(); + int i = 0; + boolean startNewWord = true; + while (i < length) { + char c = token.charAt(i); + if (isPunctuation(c)) { + List list = Arrays.asList(c); + result.add(list); + startNewWord = true; + } else { + if (startNewWord) { + result.add(new ArrayList()); + } + startNewWord = false; + result.get(result.size() - 1).add(c); + } + i += 1; + } + + List res = new ArrayList(); + for (int j = 0; j < result.size(); j++) { + String str = ""; + for (int k = 0; k < result.get(j).size(); k++) { + str += result.get(j).get(k); + } + res.add(str); + } + return res; + } + + private boolean isPunctuation(char c) + { + if ((c >= 33 && c <= 47) || (c >= 58 && c <= 64) || (c >= 91 && c <= 96) || + (c >= 123 && c <= 126)) { + return true; + } + + if (c == '“' || c == '”' || c == '、' || c == '《' || c == '》' || c == '。' || c == ';' || + c == '【' || c == '】') { + return true; + } + + return false; + } + + private List whiteSpaceTokenize(String text) + { + List result = new ArrayList(); + + text = text.trim(); + if (null == text) { + return result; + } + 
String[] tokens = text.split(" "); + result = Arrays.asList(tokens); + + return result; + } + + private String tokenizeChineseChars(String cleanText) + { + StringBuffer outStrBuf = new StringBuffer(); + + for (int i = 0; i < cleanText.length(); i++) { + char c = cleanText.charAt(i); + if (isChineseChar(c)) { + outStrBuf.append(" "); + outStrBuf.append(c); + outStrBuf.append(" "); + } else { + outStrBuf.append(c); + } + } + + return outStrBuf.toString(); + } + + private boolean isChineseChar(char c) + { + String s = String.valueOf(c); + String regex = "[\u4e00-\u9fa5]"; + Pattern p = Pattern.compile(regex); + + Matcher m = p.matcher(s); + return m.matches(); + } + + private String cleanText(String text) + { + StringBuffer outStrBuf = new StringBuffer(""); + + for (int i = 0; i < text.length(); i++) { + char c = text.charAt(i); + if (isWhiteSpace(c)) { + outStrBuf.append(" "); + } else { + outStrBuf.append(c); + } + } + return outStrBuf.toString(); + } + + private boolean isWhiteSpace(char c) + { + if (c == ' ' || c == '\t' || c == '\n' || c == '\r') { + return true; + } + + return false; + } + + public static void main(String[] args) + { + System.out.print("hello world"); + } +} diff --git a/kit/Android/ReadingComprehension/app/src/main/java/com/huawei/noah/bert/FullTokenizer.java b/kit/Android/ReadingComprehension/app/src/main/java/com/huawei/noah/bert/FullTokenizer.java new file mode 100644 index 00000000..1e87c218 --- /dev/null +++ b/kit/Android/ReadingComprehension/app/src/main/java/com/huawei/noah/bert/FullTokenizer.java @@ -0,0 +1,95 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+package com.huawei.noah.bert; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; + +public class WordpieceTokenizer { + private Map vocab; + private String unkToken = "[UNK]"; + private int maxInputCharsPerWord = 200; + private List featuresList = new ArrayList<>(); + + public WordpieceTokenizer(Map vocab) + { + this.vocab = vocab; + } + + /* + For example: + input = "unaffable" + output = ["un", "##aff", "##able"] + */ + public List tokenize(String text) + { + String lowText = text.toLowerCase(); + + featuresList.clear(); + List outputTokens = new ArrayList(); + + int length = lowText.length(); + if (length > this.maxInputCharsPerWord) { + outputTokens.add(this.unkToken); + } + + boolean isBad = false; + int start = 0; + List subTokens = new ArrayList(); + List featureTokens = new ArrayList(); + + while (start < length) { + int end = length; + String curSubStr = null; + String featureSubStr = null; + while (start < end) { + String subStr = lowText.substring(start, end); + String featureStr = text.substring(start, end); + if (start > 0) { + subStr = "##" + subStr; + featureStr = "##" + featureStr; + } + if (this.vocab.containsKey(subStr)) { + curSubStr = subStr; + featureSubStr = featureStr; + break; + } + end -= 1; + } + if (null == curSubStr) { + isBad = true; + break; + } + subTokens.add(curSubStr); + featureTokens.add(featureSubStr); + start = end; + } + + if (isBad) { + outputTokens.add(this.unkToken); + featuresList.add(this.unkToken); + } else { + outputTokens.addAll(subTokens); + featuresList.addAll(featureTokens); + } + + return outputTokens; + } + + public List getFeaturesList() + { + return featuresList; + } +} diff --git a/kit/Android/ReadingComprehension/app/src/main/java/com/huawei/noah/bert/PredictionModel.java b/kit/Android/ReadingComprehension/app/src/main/java/com/huawei/noah/bert/PredictionModel.java new file mode 100644 index 00000000..eac02c50 --- /dev/null +++ b/kit/Android/ReadingComprehension/app/src/main/java/com/huawei/noah/bert/PredictionModel.java @@ -0,0 +1,27 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+package com.huawei.noah.bert; + +public class PredictionModel { + public int start; + public int end; + public float start_logit; + public float end_logit; + + public PredictionModel(int aStart, int aEnd, float startLogit, float endLogit){ + start=aStart; + end=aEnd; + start_logit=startLogit; + end_logit=endLogit; + } +} diff --git a/kit/Android/ReadingComprehension/app/src/main/java/com/huawei/noah/bert/WordpieceTokenizer.java b/kit/Android/ReadingComprehension/app/src/main/java/com/huawei/noah/bert/WordpieceTokenizer.java new file mode 100644 index 00000000..701076a2 --- /dev/null +++ b/kit/Android/ReadingComprehension/app/src/main/java/com/huawei/noah/bert/WordpieceTokenizer.java @@ -0,0 +1,94 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+package com.huawei.noah.bert; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; + +public class WordpieceTokenizer { + + private Map vocab; + private String unkToken = "[UNK]"; + private int maxInputCharsPerWord = 200; + private List featuresList=new ArrayList<>(); + + public WordpieceTokenizer(Map vocab){ + this.vocab = vocab; + } + + /* + For example: + input = "unaffable" + output = ["un", "##aff", "##able"] + */ + public List tokenize(String text){ + + String lowText=text.toLowerCase(); + + featuresList.clear(); + List outputTokens = new ArrayList(); + + int length = lowText.length(); + if(length > this.maxInputCharsPerWord){ + outputTokens.add(this.unkToken); + } + + boolean isBad = false; + int start = 0; + List subTokens = new ArrayList(); + List featureTokens = new ArrayList(); + + while(start < length){ + int end = length; + String curSubStr = null; + String featureSubStr = null; + while(start < end){ + String subStr = lowText.substring(start, end); + String featureStr = text.substring(start, end); + if(start > 0){ + subStr = "##" + subStr; + featureStr = "##" + featureStr; + } + if(this.vocab.containsKey(subStr)){ + curSubStr = subStr; + featureSubStr = featureStr; + break; + } + end -= 1; + } + if(null == curSubStr){ + isBad = true; + break; + } + subTokens.add(curSubStr); + featureTokens.add(featureSubStr); + start = end; + } + + if(isBad){ + outputTokens.add(this.unkToken); + featuresList.add(this.unkToken); + }else{ + outputTokens.addAll(subTokens); + featuresList.addAll(featureTokens); + } + + return outputTokens; + } + + public List getFeaturesList(){ + return featuresList; + } +} diff --git a/kit/Android/ReadingComprehension/app/src/main/res/drawable-v24/ic_launcher_foreground.xml b/kit/Android/ReadingComprehension/app/src/main/res/drawable-v24/ic_launcher_foreground.xml new file mode 100644 index 00000000..2b068d11 --- /dev/null +++ b/kit/Android/ReadingComprehension/app/src/main/res/drawable-v24/ic_launcher_foreground.xml @@ -0,0 +1,30 @@ + + + + + + + + + + + \ No newline at end of file diff --git a/kit/Android/ReadingComprehension/app/src/main/res/drawable/ic_launcher_background.xml b/kit/Android/ReadingComprehension/app/src/main/res/drawable/ic_launcher_background.xml new file mode 100644 index 00000000..07d5da9c --- /dev/null +++ b/kit/Android/ReadingComprehension/app/src/main/res/drawable/ic_launcher_background.xml @@ -0,0 +1,170 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/kit/Android/ReadingComprehension/app/src/main/res/layout/activity_main.xml b/kit/Android/ReadingComprehension/app/src/main/res/layout/activity_main.xml new file mode 100644 index 00000000..5a2d6be6 --- /dev/null +++ b/kit/Android/ReadingComprehension/app/src/main/res/layout/activity_main.xml @@ -0,0 +1,146 @@ + + + + + + + + + + + + + + + +
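The WordpieceTokenizer above documents its own behaviour with the `unaffable -> [un, ##aff, ##able]` comment. Here is a small, self-contained usage sketch of that greedy longest-match logic, using a hand-built toy vocabulary (the shipped demo loads the full BERT vocab.txt instead); only `containsKey` is consulted by the tokenizer, so the integer ids are arbitrary:

```java
package com.huawei.noah.bert;

import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class WordpieceTokenizerSketch {
    public static void main(String[] args) {
        // Toy vocabulary; the real app builds this map from vocab.txt.
        Map<String, Integer> vocab = new HashMap<>();
        vocab.put("un", 1);
        vocab.put("##aff", 2);
        vocab.put("##able", 3);

        WordpieceTokenizer tokenizer = new WordpieceTokenizer(vocab);

        // Greedy longest-match from the left yields: [un, ##aff, ##able]
        List<?> pieces = tokenizer.tokenize("unaffable");
        System.out.println(pieces);

        // A word with no matching sub-strings collapses to the unknown token: [[UNK]]
        System.out.println(tokenizer.tokenize("zzz"));
    }
}
```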
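Similarly, the BasicTokenizer added earlier in this change interleaves three passes — whitespace cleanup, spacing out Chinese characters, and splitting punctuation into standalone tokens. A short sketch of the resulting behaviour on mixed English/Chinese input; the expected output noted in the comment is worked out from the code above, not taken from a test in this change:

```java
package com.huawei.noah.bert;

import java.util.List;

public class BasicTokenizerSketch {
    public static void main(String[] args) {
        BasicTokenizer tokenizer = new BasicTokenizer();

        // Chinese characters become single-character tokens and punctuation
        // is split into its own token; expected: [Hello, ,, 世, 界, !]
        List<?> tokens = tokenizer.tokenize("Hello, 世界!");
        System.out.println(tokens);
    }
}
```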